From 6f2ebb2a52fa99735e92867ffdfbd701926f3dd8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 18 Apr 2024 15:35:03 +0300 Subject: [PATCH] Revert Step8 and Step11 to use Hive again, since their "UPDATE" statements are not supported by Spark. --- .../graph/stats/oozie_app/scripts/step11.sql | 18 +++---- .../graph/stats/oozie_app/scripts/step8.sql | 37 +++++++------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 49 ++++++++++++++----- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index f4d06587b..207c1b124 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -10,7 +10,7 @@ SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); /*EOS*/ + WHERE d.id = rd.datasource); -- /*EOS*/ -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables UPDATE ${stats_db_name}.project_tmp @@ -19,9 +19,9 @@ WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result = r.id - AND r.type = 'publication'); /*EOS*/ + AND r.type = 'publication'); -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.project purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.project stored as parquet as SELECT p.id, @@ -64,7 +64,7 @@ FROM ${stats_db_name}.project_tmp p AND r.type = 'publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; /*EOS*/ + ON prr2.id = p.id; -- /*EOS*/ UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' @@ -74,7 +74,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' @@ -84,7 +84,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' @@ -94,7 +94,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' @@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, @@ -117,4 +117,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file + AND project.id = result_projects.project; -- /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 23fa743f9..07e19d68b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,7 +5,7 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_tmp ( @@ -23,6 +23,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp issn_printed STRING, issn_online STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/ -- Insert statement that takes into account the piwik_id of the openAIRE graph INSERT INTO ${stats_db_name}.datasource_tmp @@ -46,16 +47,16 @@ FROM ${openaire_db_name}.datasource d1 LATERAL VIEW EXPLODE(originalid) temp AS originalidd WHERE originalidd like "piwik:%") AS d2 ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/ +WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- /*EOS*/ -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -DROP TABLE IF EXISTS ${stats_db_name}.dual purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.dual purge; -- /*EOS*/ -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/ +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); -- /*EOS*/ -INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/ +INSERT INTO ${stats_db_name}.dual VALUES ('X'); -- /*EOS*/ INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) @@ -73,42 +74,42 @@ SELECT 'other', null, null FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/ -DROP TABLE ${stats_db_name}.dual; /*EOS*/ +WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -- /*EOS*/ +DROP TABLE ${stats_db_name}.dual; -- /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -- /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. -DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; /*EOS*/ +FROM ${stats_db_name}.result_datasources; -- /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index c2c6f9822..5c255a488 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -308,7 +308,7 @@ - + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + + + @@ -375,7 +386,7 @@ - + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} + + +