From db33f7727cd1dc284ee7a2a9ca3435fe3dae69d6 Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Mon, 15 Apr 2024 16:22:40 +0300
Subject: [PATCH] Update "dhp-stats-update" workflow to use "spark"-actions,
 instead of "hive" ones.

Note: Currently the code is set to only test the "Step1".
---
 .../dhp-stats-update/installProject.sh        |  18 +
 .../dhp-stats-update/runOozieWorkfow.sh       |  20 +
 .../graph/stats/oozie_app/scripts/step1.sql   |   4 +-
 .../graph/stats/oozie_app/scripts/step10.sql  |  24 +-
 .../graph/stats/oozie_app/scripts/step11.sql  |  18 +-
 .../graph/stats/oozie_app/scripts/step12.sql  |  34 +-
 .../graph/stats/oozie_app/scripts/step13.sql  |  34 +-
 .../graph/stats/oozie_app/scripts/step14.sql  |  30 +-
 .../graph/stats/oozie_app/scripts/step15.sql  |  26 +-
 .../stats/oozie_app/scripts/step15_5.sql      |  36 +-
 .../scripts/step16_1-definitions.sql          |  12 +-
 .../stats/oozie_app/scripts/step16_5.sql      |  20 +-
 .../graph/stats/oozie_app/scripts/step2.sql   |  38 +-
 .../scripts/step21-createObservatoryDB.sql    |  38 +-
 .../graph/stats/oozie_app/scripts/step3.sql   |  38 +-
 .../graph/stats/oozie_app/scripts/step4.sql   |  36 +-
 .../graph/stats/oozie_app/scripts/step5.sql   |  36 +-
 .../graph/stats/oozie_app/scripts/step6.sql   |  30 +-
 .../graph/stats/oozie_app/scripts/step7.sql   |  30 +-
 .../graph/stats/oozie_app/scripts/step8.sql   |  36 +-
 .../graph/stats/oozie_app/scripts/step9.sql   |   8 +-
 .../dhp/oa/graph/stats/oozie_app/workflow.xml | 443 +++++++++++++-----
 22 files changed, 627 insertions(+), 382 deletions(-)
 create mode 100755 dhp-workflows/dhp-stats-update/installProject.sh
 create mode 100755 dhp-workflows/dhp-stats-update/runOozieWorkfow.sh

diff --git a/dhp-workflows/dhp-stats-update/installProject.sh b/dhp-workflows/dhp-stats-update/installProject.sh
new file mode 100755
index 000000000..afd95578d
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/installProject.sh
@@ -0,0 +1,18 @@
+# Install the whole "dnet-hadoop" project.
+
+# Delete this module's previous build-files in order to avoid any conflicts.
+rm -rf target/
+
+# Go to the root directory of this project.
+cd ../../
+
+# Select the build profile.
+DEFAULT_PROFILE='' # It's the empty profile.
+NEWER_VERSIONS_PROFILE='-Pscala-2.12'
+CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
+
+# Install the project.
+mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
+
+# We skip tests for all modules, since they take a big amount of time and some of them fail.
+# Any test added to this module will be executed in the "runOozieWorkflow.sh" script.
diff --git a/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh
new file mode 100755
index 000000000..a4825a3ae
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh
@@ -0,0 +1,20 @@
+# This script deploys and runs the oozie workflow on the cluster defined in the "~/.dhp/application.properties" file.
+
+# Select the build profile.
+DEFAULT_PROFILE='' # It's the empty profile.
+NEWER_VERSIONS_PROFILE='-Pscala-2.12'
+CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
+
+# Build and deploy this module.
+mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
+    -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
+
+# Show the Oozie-job-ID.
+echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n" +cat ./target/extract-and-run-on-remote-host.log + +# Check oozie workflow status +# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info + +# Get the from the previous output and check the logs: +# yarn logs -applicationId diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql index 9697a1dc8..467a98872 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql @@ -4,5 +4,5 @@ -------------------------------------------------------------- -------------------------------------------------------------- -DROP database IF EXISTS ${stats_db_name} CASCADE; -CREATE database ${stats_db_name}; +DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/ +CREATE database ${stats_db_name}; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index bbd7b3bbc..9088ce205 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -5,27 +5,27 @@ ------------------------------------------------------------------------------------------------ CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * -FROM ${external_stats_db_name}.fundref; +FROM ${external_stats_db_name}.fundref; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * -FROM ${external_stats_db_name}.country; +FROM ${external_stats_db_name}.country; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * -FROM ${external_stats_db_name}.countrygdp; +FROM ${external_stats_db_name}.countrygdp; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * -FROM ${external_stats_db_name}.roarmap; +FROM ${external_stats_db_name}.roarmap; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * -FROM ${external_stats_db_name}.rndexpediture; +FROM ${external_stats_db_name}.rndexpediture; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS SELECT * -FROM ${external_stats_db_name}.licenses_normalized; +FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ @@ -33,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized; ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ create or replace view ${stats_db_name}.usage_stats as -select * from openaire_prod_usage_stats.usage_stats; +select * from openaire_prod_usage_stats.usage_stats; /*EOS*/ create or replace view ${stats_db_name}.downloads_stats as -select * from openaire_prod_usage_stats.downloads_stats; +select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/ create or replace view 
${stats_db_name}.pageviews_stats as -select * from openaire_prod_usage_stats.pageviews_stats; +select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/ create or replace view ${stats_db_name}.views_stats as -select * from openaire_prod_usage_stats.views_stats; +select * from openaire_prod_usage_stats.views_stats; /*EOS*/ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; +DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/ create table ${stats_db_name}.creation_date STORED AS PARQUET as -select date_format(current_date(), 'dd-MM-yyyy') as date; +select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 638fb0f7a..06600db19 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -10,7 +10,7 @@ SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); + WHERE d.id = rd.datasource); /*EOS*/ -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables UPDATE ${stats_db_name}.project_tmp @@ -19,8 +19,8 @@ WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result = r.id - AND r.type = 'publication'); -DROP TABLE IF EXISTS ${stats_db_name}.stored purge; + AND r.type = 'publication'); /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.stored purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project stored as parquet as SELECT p.id, @@ -63,7 +63,7 @@ FROM ${stats_db_name}.project_tmp p AND r.type = 'publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; + ON prr2.id = p.id; /*EOS*/ UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' @@ -73,7 +73,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' @@ -83,7 +83,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' @@ -93,7 +93,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' @@ 
-103,7 +103,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, @@ -116,4 +116,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; \ No newline at end of file + AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 0a1904de7..ff95524be 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,42 +1,42 @@ ------------------------------------------------------------------------------------------------------ -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * -FROM ${stats_db_name}.datasource_tmp; +FROM ${stats_db_name}.datasource_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * -FROM ${stats_db_name}.publication_tmp; +FROM ${stats_db_name}.publication_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * -FROM ${stats_db_name}.dataset_tmp; +FROM ${stats_db_name}.dataset_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software purge; +DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * -FROM ${stats_db_name}.software_tmp; +FROM ${stats_db_name}.software_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * -FROM ${stats_db_name}.otherresearchproduct_tmp; +FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.project_tmp; -DROP TABLE ${stats_db_name}.datasource_tmp; -DROP TABLE ${stats_db_name}.publication_tmp; -DROP TABLE ${stats_db_name}.dataset_tmp; -DROP TABLE ${stats_db_name}.software_tmp; -DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; +DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ 
---------------------------------------------- -- Re-creating views from final parquet tables @@ -54,4 +54,4 @@ SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct; +FROM ${stats_db_name}.otherresearchproduct; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 8c1dbdc4d..68a46ded3 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -5,7 +5,7 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -16,9 +16,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -29,9 +29,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -42,9 +42,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -55,7 +55,7 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS 
SELECT * FROM ${stats_db_name}.publication_sources @@ -64,9 +64,9 @@ SELECT * FROM ${stats_db_name}.dataset_sources UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid @@ -76,9 +76,9 @@ from ( LATERAL VIEW explode(author) a as auth LATERAL VIEW explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type - WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; + WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype @@ -91,9 +91,9 @@ where reltype='resultResult' and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' - and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations @@ -108,9 +108,9 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(target, 4); +group by substr(target, 4); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references @@ -125,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(source, 4); \ No newline at end of file +group by substr(source, 4); /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index f50c13521..f61c70221 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -5,33 +5,33 @@ -- Licences 
related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses @@ -40,15 +40,15 @@ SELECT * FROM ${stats_db_name}.dataset_licenses UNION ALL SELECT * FROM ${stats_db_name}.software_licenses UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid -from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; +from 
${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource @@ -58,10 +58,10 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result -lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; +lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index ce6b6cc2f..7c618fd0f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -6,7 +6,7 @@ ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -22,9 +22,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -40,9 +40,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -58,9 +58,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS 
${stats_db_name}.otherresearchproduct_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -76,7 +76,7 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed @@ -85,17 +85,17 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; +select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; +DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids -where measures_ids.id!='views' and measures_ids.id!='downloads'; +where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, @@ -104,4 +104,4 @@ rel.properties[1].value apc_currency from ${openaire_db_name}.relation rel join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.result r on r.id=rel.target -where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; \ No newline at end of file +where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 6ed686a05..54743e046 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,25 +1,25 @@ ------------------------------------------- --- Extra tables, mostly used by indicators -DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS 
${stats_db_name}.result_fundercount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/ create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( @@ -33,17 +33,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='other' then rcount.count else 0 end) as other from rcount -group by rcount.pid; +group by rcount.pid; /*EOS*/ -create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; -create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; -create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; -create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; -create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; -create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; +create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/ +create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/ +create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/ +create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/ +create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/ create table if not exists ${stats_db_name}.result_instance stored as parquet as select distinct r.* @@ -51,9 +51,9 @@ from ( select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r -join ${stats_db_name}.result res on res.id=r.id; +join ${stats_db_name}.result res on res.id=r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as select distinct r.id, r.amount, 
r.currency @@ -61,6 +61,6 @@ from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; +where r.amount is not null; /*EOS*/ -create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file +create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index b55af13d4..399381b12 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,26 +3,26 @@ ---------------------------------------------------- -- Peer reviewed: -drop table if exists ${stats_db_name}.result_peerreviewed purge; +drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id -left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/ -- Green OA: -drop table if exists ${stats_db_name}.result_greenoa purge; +drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r -left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/ -- GOLD OA: -drop table if exists ${stats_db_name}.result_gold purge; +drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as select r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r - left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 7faa91697..1b838ca1b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,6 +1,6 @@ -- replace the creation of the result view to include the boolean fields 
from the previous tables (green, gold, -- peer reviewed) -drop table if exists ${stats_db_name}.result_tmp; +drop table if exists ${stats_db_name}.result_tmp; /*EOS*/ CREATE TABLE ${stats_db_name}.result_tmp ( id STRING, @@ -20,37 +20,37 @@ CREATE TABLE ${stats_db_name}.result_tmp ( peer_reviewed BOOLEAN, green BOOLEAN, gold BOOLEAN) -clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); +clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.publication r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.dataset r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.software r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.otherresearchproduct r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ -drop table if exists ${stats_db_name}.result; -drop view if exists ${stats_db_name}.result; -create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; \ No newline at end of file +drop table if exists ${stats_db_name}.result; /*EOS*/ +drop view if exists ${stats_db_name}.result; /*EOS*/ +create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/ +drop table ${stats_db_name}.result_tmp; /*EOS*/ \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 8e56f98fc..4aa90b1a2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -5,7 +5,7 @@ -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_tmp ( id STRING, @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.publication_tmp abstract BOOLEAN, type STRING ) - clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); + clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, @@ -39,17 +39,17 @@ SELECT substr(p.id, 4) as id, case when size(p.description) > 0 then true else false end as abstract, 'publication' as type from ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -58,9 +58,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -71,44 +71,44 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_languages 
STORED AS PARQUET AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file + and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 66620ac38..adcf23b7a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -5,7 +5,7 @@ from ${stats_db_name}.result r select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count from ${stats_db_name}.result_licenses rl group by rl.id -) rln on rln.id=r.id; +) rln on rln.id=r.id; /*EOS*/ create table ${observatory_db_name}.result_affiliated_country stored as parquet as @@ -35,7 +35,7 @@ from ${stats_db_name}.result r left outer join 
${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year stored as parquet as @@ -65,7 +65,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as @@ -95,7 +95,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as @@ -127,7 +127,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select @@ -158,7 +158,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select @@ -187,7 +187,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count 
> 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select @@ -216,7 +216,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select @@ -247,7 +247,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select @@ -278,7 +278,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_country stored as parquet as select @@ -309,7 +309,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_year stored as parquet as select @@ -340,7 +340,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table ${observatory_db_name}.result_deposited_year_country stored as parquet as @@ -372,7 +372,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case 
when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select @@ -403,7 +403,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select @@ -434,7 +434,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization stored as parquet as select @@ -465,7 +465,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select @@ -496,7 +496,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder stored as parquet as select @@ -529,7 +529,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select @@ -562,4 +562,4 
@@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 0384de4ec..1ff4beadb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,7 +5,7 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_tmp ( @@ -23,7 +23,7 @@ CREATE TABLE ${stats_db_name}.dataset_tmp abstract BOOLEAN, type STRING ) - clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); + clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, @@ -40,26 +40,26 @@ SELECT substr(d.id, 4) AS id, CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, 'dataset' AS type FROM ${openaire_db_name}.dataset d -WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; +WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; + and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -68,9 +68,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then 
concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource @@ -82,35 +82,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index d8f4d65e4..426d53773 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,7 +5,7 @@ 
-------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_tmp ( id STRING, @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.software_tmp abstract BOOLEAN, type STRING ) - clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); + clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, @@ -39,24 +39,24 @@ SELECT substr(s.id, 4) as id, CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, 'software' as type from ${openaire_db_name}.software s -where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; + and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -65,9 +65,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource @@ -79,35 +79,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select 
substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index fae0fbb63..6b5adff9d 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,7 +5,7 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp source STRING, abstract BOOLEAN, type STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, @@ -39,23 +39,23 @@ SELECT substr(o.id, 4) AS id, CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, 'other' AS type FROM ${openaire_db_name}.otherresearchproduct o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; +WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/ -- Otherresearchproduct_citations -DROP TABLE IF EXISTS 
${stats_db_name}.otherresearchproduct_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; + and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -63,9 +63,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource @@ -74,32 +74,32 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p LEFT OUTER JOIN(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p 
LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 165f77946..75ec7d69c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,38 +3,38 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' - and 
r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_tmp ( @@ -61,7 +61,7 @@ CREATE TABLE ${stats_db_name}.project_tmp totalcost FLOAT, fundedamount FLOAT, currency STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, @@ -88,18 +88,18 @@ SELECT substr(p.id, 4) AS id, p.fundedamount AS fundedamount, p.currency.value AS currency FROM ${openaire_db_name}.project p -WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.funder purge; +DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, @@ -107,4 +107,4 @@ properties[0].value contribution, properties[1].value currency from ${openaire_db_name}.relation r LATERAL VIEW explode (r.properties) properties where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' -and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index eb16a161e..2cc7c13c4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -16,7 +16,7 @@ SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset_tmp UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct_tmp; +FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ -- Views on final tables CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS @@ -30,7 +30,7 @@ SELECT * FROM ${stats_db_name}.dataset_datasources UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_datasources; +FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS SELECT * @@ -43,7 +43,7 @@ SELECT * FROM ${stats_db_name}.dataset_citations UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_citations; +FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS SELECT * @@ -56,7 +56,7 @@ SELECT * FROM ${stats_db_name}.dataset_classifications UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_classifications; +FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS SELECT * @@ -69,7 +69,7 @@ SELECT * FROM ${stats_db_name}.dataset_concepts UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_concepts; +FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS SELECT * @@ -82,7 +82,7 @@ SELECT * FROM ${stats_db_name}.dataset_languages UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_languages; +FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS SELECT * @@ -95,7 +95,7 @@ SELECT * FROM ${stats_db_name}.dataset_oids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_oids; +FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * @@ -108,7 +108,7 @@ SELECT * FROM ${stats_db_name}.dataset_pids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_pids; +FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * @@ -121,9 +121,9 @@ SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_topics; +FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/ create table ${stats_db_name}.result_fos stored as parquet as with @@ -133,22 +133,22 @@ with select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.target like '50|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + 
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 07204db0c..3f40dbb21 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,7 +5,7 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_tmp ( @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp `compatibility` STRING, issn_printed STRING, issn_online STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ -- Insert statement that takes into account the piwik_id of the openAIRE graph INSERT INTO ${stats_db_name}.datasource_tmp @@ -46,14 +46,14 @@ FROM ${openaire_db_name}.datasource d1 LATERAL VIEW EXPLODE(originalid) temp AS originalidd WHERE originalidd like "piwik:%") AS d2 ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; +WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/ -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
-- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/ -INSERT INTO ${stats_db_name}.dual VALUES ('X'); +INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/ INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) @@ -71,42 +71,42 @@ SELECT 'other', null, null FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -DROP TABLE ${stats_db_name}.dual; +WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/ +DROP TABLE ${stats_db_name}.dual; /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. 
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; +FROM ${stats_db_name}.result_datasources; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index 19d301e27..afde8160e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -3,7 +3,7 @@ -- Organization table/view and Organization related tables/views ---------------------------------------------------------------- ---------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.organization purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS SELECT substr(o.id, 4) as id, @@ -11,12 +11,12 @@ SELECT substr(o.id, 4) as id, o.legalshortname.value as legalshortname, o.country.classid as country FROM ${openaire_db_name}.organization o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; +WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource -FROM ${stats_db_name}.datasource_organizations; +FROM ${stats_db_name}.datasource_organizations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id -FROM ${stats_db_name}.project_organizations; \ No newline at end of file +FROM ${stats_db_name}.project_organizations; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 813fffcf9..1460477ae 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -154,180 +154,354 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - - + + yarn + cluster + Step1 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + + + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - 
openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step2 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step3 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step4 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step6 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step7 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step8 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql + --stats_db_name${stats_db_name} + 
--openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step9 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step10 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step11 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step12 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step13 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step14 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step15 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + 
${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step15_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + @@ -379,23 +553,45 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_1-definitions + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + @@ -461,12 +657,23 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - observatory_db_name=${observatory_db_name} - + + yarn + cluster + Step21-createObservatoryDB + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} +
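Every converted action above invokes the same driver class, eu.dnetlib.dhp.oozie.RunSQLSparkJob, pointing it at one of the step*.sql scripts, which is why each statement in those scripts now ends with the /*EOS*/ terminator. The sketch below only illustrates that pattern and is not the actual class: argument handling is reduced to positional parameters, and the parameter substitution and statement splitting are assumptions inferred from the marker and from the --stats_db_name / --openaire_db_name arguments visible in the workflow.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.spark.sql.SparkSession;

// Illustrative sketch only (not eu.dnetlib.dhp.oozie.RunSQLSparkJob): read an SQL
// script, substitute the workflow parameters, split it on the statement terminator
// added in this patch, and run each statement through Spark SQL with Hive support.
public class RunSqlScriptSketch {

    public static void main(String[] args) throws IOException {
        String sqlPath = args[0];        // e.g. .../oozie_app/scripts/step1.sql
        String statsDbName = args[1];    // value of --stats_db_name
        String openaireDbName = args[2]; // value of --openaire_db_name

        SparkSession spark = SparkSession.builder()
            .appName("run-sql-script-sketch")
            .enableHiveSupport() // assumes hive.metastore.uris is provided via --conf
            .getOrCreate();

        String script = new String(Files.readAllBytes(Paths.get(sqlPath)), StandardCharsets.UTF_8);

        // Substitute the ${...} parameters referenced by the scripts.
        script = script
            .replace("${stats_db_name}", statsDbName)
            .replace("${openaire_db_name}", openaireDbName);

        // Split on the explicit end-of-statement marker rather than on ';',
        // which could also appear inside string literals or comments.
        for (String statement : script.split("/\\*EOS\\*/")) {
            String trimmed = statement.trim();
            if (trimmed.isEmpty()) {
                continue;
            }
            // spark.sql() expects a single statement, so drop a trailing semicolon.
            if (trimmed.endsWith(";")) {
                trimmed = trimmed.substring(0, trimmed.length() - 1);
            }
            spark.sql(trimmed);
        }

        spark.stop();
    }
}
```

Splitting on the explicit /*EOS*/ marker rather than on semicolons is presumably the reason the marker was added to every statement in the step scripts: it gives the Spark-based runner an unambiguous statement boundary that a naive split on ';' would not.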