From 12749a0a776d5ca1b9f1f59f4809b8f717bac228 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 26 Nov 2021 15:40:40 +0200 Subject: [PATCH 1/6] first --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 16 ++++------ .../oa/graph/stats/oozie_app/finalizedb.sh | 4 +-- .../graph/stats/oozie_app/scripts/step10.sql | 23 +++++++-------- .../graph/stats/oozie_app/scripts/step11.sql | 2 +- .../graph/stats/oozie_app/scripts/step13.sql | 10 +++---- .../graph/stats/oozie_app/scripts/step14.sql | 29 +++++-------------- .../graph/stats/oozie_app/scripts/step15.sql | 10 +++---- .../stats/oozie_app/scripts/step15_5.sql | 14 ++++----- .../scripts/step16_1-definitions.sql | 6 ++-- .../graph/stats/oozie_app/scripts/step2.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step3.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step4.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step5.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step6.sql | 8 ++--- .../graph/stats/oozie_app/scripts/step7.sql | 4 +-- .../graph/stats/oozie_app/scripts/step8.sql | 10 +++---- 16 files changed, 88 insertions(+), 112 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index b66ab47e0..fafb45cb0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -30,18 +30,14 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} echo "Creating and populating impala tables" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" -impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" -impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" +hive -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" +hive -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" +hive -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" +hive -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" echo "Cleaning up" -hdfs dfs -rm -f -r -skipTrash ${TMP} rm concepts.csv rm categories.csv rm contexts.csv diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index d04c5ccfd..8eade479a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,9 +10,7 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -impala-shell -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - impala-shell -q "create database if not exists ${SHADOW}" impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index fc0162a9c..2808d0a3a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -4,29 +4,28 @@ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS -SELECT * -FROM ${external_stats_db_name}.fundref; +SELECT * FROM ${external_stats_db_name}.fundref; CREATE OR REPLACE VIEW ${stats_db_name}.country AS -SELECT * -FROM ${external_stats_db_name}.country; +SELECT * FROM ${external_stats_db_name}.country; CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS -SELECT * -FROM ${external_stats_db_name}.countrygdp; +SELECT * FROM ${external_stats_db_name}.countrygdp; CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS -SELECT * -FROM ${external_stats_db_name}.roarmap; +SELECT * FROM ${external_stats_db_name}.roarmap; CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS -SELECT * -FROM ${external_stats_db_name}.rndexpediture; +SELECT * FROM ${external_stats_db_name}.rndexpediture; CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS -SELECT * -FROM ${external_stats_db_name}.licenses_normalized; +SELECT * FROM ${external_stats_db_name}.licenses_normalized; +create view ${stats_db_name}.rndexpenditure as +select * from stats_ext.rndexpediture; + +create view ${stats_db_name}.issn_gold_oa_dataset as +select * from stats_ext.issn_gold_oa_dataset; ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index e892da0be..280c1bb51 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -102,7 +102,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index e4e81175c..6e63e2836 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -5,7 +5,7 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -16,7 +16,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -27,7 +27,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -38,7 +38,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -59,7 +59,7 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -create table ${stats_db_name}.result_orcid as +create table ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 00a6913bc..f38ad886e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -5,27 +5,27 @@ -- Licences related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses STORED AS PARQUET AS SELECT * FROM ${stats_db_name}.publication_licenses UNION ALL SELECT * FROM ${stats_db_name}.dataset_licenses @@ -34,11 +34,11 @@ SELECT * FROM ${stats_db_name}.software_licenses UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource FROM ( SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource @@ -46,17 +46,4 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; - --- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 8e66e05c0..f293c1fd3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -6,27 +6,27 @@ ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 3a7d9f455..06c425dcd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,21 +1,21 @@ ------------------------------------------- --- Extra tables, mostly used by indicators -create table ${stats_db_name}.result_projectcount as +create table ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.result_fundercount as +create table ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.project_resultcount as +create table ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( select p.id as pid, count(distinct r.id) as `count`, r.type as type from ${stats_db_name}.project p @@ -29,8 +29,6 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; - create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -39,12 +37,10 @@ from ( from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; -create table ${stats_db_name}.result_apc as +create table ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; - -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +where r.amount is not null; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 6b4d9b1b0..5a4aecd5d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,20 +3,20 @@ ---------------------------------------------------- -- Peer reviewed: -create table ${stats_db_name}.result_peerreviewed as +create table ${stats_db_name}.result_peerreviewed stored as parquet as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; -- Green OA: -create table ${stats_db_name}.result_greenoa as +create table ${stats_db_name}.result_greenoa stored as parquet as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: -create table ${stats_db_name}.result_gold as +create table ${stats_db_name}.result_gold stored as parquet as select r.id, case when gold.gold_oa=1 then true else false end as gold from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index bb0d0ac6c..dfe6246ca 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -40,13 +40,13 @@ SELECT substr(p.id, 4) as id, from ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_classifications AS +CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_concepts AS +CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -55,7 +55,7 @@ from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_datasources as +CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource @@ -66,30 +66,30 @@ FROM ( from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.publication_languages AS +CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_oids AS +CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_pids AS +CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_topics as +CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_citations AS +CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 953eaad6a..4841944a2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -40,20 +40,20 @@ SELECT substr(d.id, 4) AS id, FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE; -CREATE TABLE ${stats_db_name}.dataset_citations AS +CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and d.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_classifications AS +CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_concepts AS +CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -62,7 +62,7 @@ from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_datasources AS +CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -74,24 +74,24 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.dataset_languages AS +CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_oids AS +CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_pids AS +CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_topics AS +CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0210dc8cb..390a5e338 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -40,20 +40,20 @@ SELECT substr(s.id, 4) as id, from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_citations AS +CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and s.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_classifications AS +CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_concepts AS +CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -62,7 +62,7 @@ FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_datasources AS +CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -74,24 +74,24 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.software_languages AS +CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_oids AS +CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_pids AS +CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_topics AS +CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index f7b302186..65c2f0c33 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -40,18 +40,18 @@ FROM ${openaire_db_name}.otherresearchproduct o WHERE o.datainfo.deletedbyinference = FALSE; -- Otherresearchproduct_citations -CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -59,7 +59,7 @@ SELECT substr(p.id, 4) as id, case FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance @@ -68,22 +68,22 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 378e0f17b..c75659c46 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,7 +3,7 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE ${stats_db_name}.project_oids AS +CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; CREATE TABLE ${stats_db_name}.project_organizations AS @@ -12,13 +12,13 @@ from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.project_results AS +CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.datainfo.deletedbyinference = false; -create table ${stats_db_name}.project_classification as +create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class @@ -74,7 +74,7 @@ SELECT substr(p.id, 4) AS id, FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false; -create table ${stats_db_name}.funder as +create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index b3cbc9b41..99fa47767 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -123,13 +123,13 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -CREATE TABLE ${stats_db_name}.result_organization AS +CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.result_projects AS +CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 76d31eb5e..6753d8190 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -80,15 +80,15 @@ UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -CREATE TABLE ${stats_db_name}.datasource_languages AS +CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages; -CREATE TABLE ${stats_db_name}.datasource_oids AS +CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; -CREATE TABLE ${stats_db_name}.datasource_organizations AS +CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'datasourceOrganization' @@ -96,11 +96,11 @@ WHERE r.reltype = 'datasourceOrganization' -- datasource sources: -- where the datasource info have been collected from. -create table if not exists ${stats_db_name}.datasource_sources AS +create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; \ No newline at end of file From d05210ba9991c9276b5cb7e5f1258379fa0834b6 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 30 Nov 2021 19:01:48 +0200 Subject: [PATCH 2/6] finished migration to hive only --- .../dhp/oa/graph/stats/oozie_app/finalizedb.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/indicators.sh | 6 +++--- .../dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/observatory-post.sh | 12 +++++++----- .../dhp/oa/graph/stats/oozie_app/observatory-pre.sh | 7 ++++--- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 8eade479a..60771dfa7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,8 +10,11 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo +hive -f foo +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 93faa43d6..72d6d8048 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo +hive --database ${TARGET} -f step16-createIndicatorsTables.sql echo "Indicators created" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index c5bda6d39..37809652d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -15,11 +15,14 @@ echo "Getting file from " $4 hdfs dfs -copyToLocal $4 echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index db8d39af2..d074e6a55 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -10,12 +10,14 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow observatory database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index 55a308c50..be009cd45 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -11,6 +11,7 @@ export TARGET=$2 export SHADOW=$3 echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +hive -e "drop database if exists ${TARGET} cascade" +hive -e "create database if not exists ${TARGET}" +hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo From 915f758c82f366e45388b80e3bfd42b5388d75df Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 16:26:14 +0200 Subject: [PATCH 3/6] moving data to impala cluster and creating shadow databases there --- .../oa/graph/stats/oozie_app/finalizedb.sh | 8 +---- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 10 +----- .../graph/stats/oozie_app/observatory-post.sh | 10 +----- .../graph/stats/oozie_app/scripts/step8.sql | 23 +++++------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 36 +++++++++++++++++-- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 60771dfa7..9de472955 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -11,10 +11,4 @@ export SHADOW=$2 echo "Updating shadow database" hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive -f foo -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +hive -f foo \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 37809652d..a4e7eec57 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -17,12 +17,4 @@ hdfs dfs -copyToLocal $4 echo "Creating monitor database" cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow monitor database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index d074e6a55..12315c9e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -12,12 +12,4 @@ export SHADOW=$3 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow observatory database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 6753d8190..33e1e3527 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual -( - dummy CHAR(1) -); -INSERT INTO ${stats_db_name}.dual -VALUES ('X'); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); + +INSERT INTO ${stats_db_name}.dual VALUES ('X'); + INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', @@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; -UPDATE ${stats_db_name}.datasource_tmp -SET name='Other' -WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp -SET yearofvalidation=null -WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language @@ -91,8 +85,7 @@ FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' - and r.datainfo.deletedbyinference = false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false; -- datasource sources: -- where the datasource info have been collected from. @@ -101,6 +94,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 08d33f4e8..7ac3cefbb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -365,11 +365,43 @@ ${observatory_db_shadow_name} observatory-post.sh - + - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${external_stats_db_name} + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + ${jobTracker} ${nameNode} From ddd34087c25395f135ef0c075264095c62691a67 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 23:05:00 +0200 Subject: [PATCH 4/6] removed 'stored as parquet' from views.. --- .../oozie_app/copyDataToImpalaCluster.sh | 57 +++++++++++++++++++ .../stats/oozie_app/finalizeImpalaCluster.sh | 27 +++++++++ .../graph/stats/oozie_app/scripts/step11.sql | 7 +-- .../graph/stats/oozie_app/scripts/step14.sql | 2 +- .../graph/stats/oozie_app/scripts/step15.sql | 2 +- 5 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..9846eb66a --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,57 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function copydb() { + db=$1 + + # copy the databases from ocean to impala + + #echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp + + # change ownership to impala + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + echo "creating schema for ${db}" + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # run the same command twice because we may have failures in the first run (due to views pointing to the same db) + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db +} + +STATS_DB=$1 +MONITOR_DB=$2 +OBSERVATORY_DB=$3 +EXT_DB=$4 + +copydb $EXT_DB +copydb $STATS_DB +copydb $MONITOR_DB +copydb $OBSERVATORY_DB + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..31107c7ed --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,27 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createShadowDB() { + SOURCE=$1 + SHADOW=$2 + + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +} + +STATS_DB=$1 +STATS_DB_SHADOW=$2 +MONITOR_DB=$3 +MONITOR_DB_SHADOW=$4 +OBSERVATORY_DB=$5 +OBSERVATORY_DB_SHADOW=$6 + +createShadowDB $STATS_DB $STATS_DB_SHADOW +createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW +createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 280c1bb51..d699b68c3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -102,7 +102,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, @@ -113,7 +113,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; - --- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + AND project.id = result_projects.project; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index f38ad886e..e9c2e014c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -25,7 +25,7 @@ SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses STORED AS PARQUET AS +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses UNION ALL SELECT * FROM ${stats_db_name}.dataset_licenses diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index f293c1fd3..58e7c7c11 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed From 0353f93d54c948df36f01b4da4498fd2264375eb Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Apr 2022 12:49:27 +0300 Subject: [PATCH 5/6] added new hive opts --- .../dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 9 +++++---- .../dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 10 +++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 72d6d8048..fef569b59 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,12 +8,13 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo -hive -f foo -hive --database ${TARGET} -f step16-createIndicatorsTables.sql -echo "Indicators created" \ No newline at end of file +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo +hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql +echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 7ac3cefbb..5661e1e73 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -67,10 +67,14 @@ hive.txn.timeout ${hive_timeout} + + mapred.job.queue.name + analytics + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -276,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - + @@ -414,4 +418,4 @@ - \ No newline at end of file + From 6fc9ef53f64ac137a51e4da11a0bd1f3ef36304b Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Jul 2022 16:36:20 +0300 Subject: [PATCH 6/6] addded command line params to allow hive actions to run --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 19 ++++++++++++------- .../oa/graph/stats/oozie_app/indicators.sh | 3 ++- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index fafb45cb0..0ce57e095 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -9,6 +9,8 @@ fi CONTEXT_API=$1 TARGET_DB=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" + TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -280,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - +