From 12749a0a776d5ca1b9f1f59f4809b8f717bac228 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 26 Nov 2021 15:40:40 +0200 Subject: [PATCH 01/47] first --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 16 ++++------ .../oa/graph/stats/oozie_app/finalizedb.sh | 4 +-- .../graph/stats/oozie_app/scripts/step10.sql | 23 +++++++-------- .../graph/stats/oozie_app/scripts/step11.sql | 2 +- .../graph/stats/oozie_app/scripts/step13.sql | 10 +++---- .../graph/stats/oozie_app/scripts/step14.sql | 29 +++++-------------- .../graph/stats/oozie_app/scripts/step15.sql | 10 +++---- .../stats/oozie_app/scripts/step15_5.sql | 14 ++++----- .../scripts/step16_1-definitions.sql | 6 ++-- .../graph/stats/oozie_app/scripts/step2.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step3.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step4.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step5.sql | 16 +++++----- .../graph/stats/oozie_app/scripts/step6.sql | 8 ++--- .../graph/stats/oozie_app/scripts/step7.sql | 4 +-- .../graph/stats/oozie_app/scripts/step8.sql | 10 +++---- 16 files changed, 88 insertions(+), 112 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index b66ab47e0..fafb45cb0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -30,18 +30,14 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} echo "Creating and populating impala tables" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" -impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" -impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" +hive -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" +hive -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" +hive -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" +hive -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" echo "Cleaning up" -hdfs dfs -rm -f -r -skipTrash ${TMP} rm concepts.csv rm categories.csv rm contexts.csv diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index d04c5ccfd..8eade479a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,9 +10,7 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -impala-shell -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - impala-shell -q "create database if not exists ${SHADOW}" impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index fc0162a9c..2808d0a3a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -4,29 +4,28 @@ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS -SELECT * -FROM ${external_stats_db_name}.fundref; +SELECT * FROM ${external_stats_db_name}.fundref; CREATE OR REPLACE VIEW ${stats_db_name}.country AS -SELECT * -FROM ${external_stats_db_name}.country; +SELECT * FROM ${external_stats_db_name}.country; CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS -SELECT * -FROM ${external_stats_db_name}.countrygdp; +SELECT * FROM ${external_stats_db_name}.countrygdp; CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS -SELECT * -FROM ${external_stats_db_name}.roarmap; +SELECT * FROM ${external_stats_db_name}.roarmap; CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS -SELECT * -FROM ${external_stats_db_name}.rndexpediture; +SELECT * FROM ${external_stats_db_name}.rndexpediture; CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS -SELECT * -FROM ${external_stats_db_name}.licenses_normalized; +SELECT * FROM ${external_stats_db_name}.licenses_normalized; +create view ${stats_db_name}.rndexpenditure as +select * from stats_ext.rndexpediture; + +create view ${stats_db_name}.issn_gold_oa_dataset as +select * from stats_ext.issn_gold_oa_dataset; ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index e892da0be..280c1bb51 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -102,7 +102,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index e4e81175c..6e63e2836 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -5,7 +5,7 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -16,7 +16,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -27,7 +27,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -38,7 +38,7 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource @@ -59,7 +59,7 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -create table ${stats_db_name}.result_orcid as +create table ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 00a6913bc..f38ad886e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -5,27 +5,27 @@ -- Licences related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses STORED AS PARQUET AS SELECT * FROM ${stats_db_name}.publication_licenses UNION ALL SELECT * FROM ${stats_db_name}.dataset_licenses @@ -34,11 +34,11 @@ SELECT * FROM ${stats_db_name}.software_licenses UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS +CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource FROM ( SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource @@ -46,17 +46,4 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; - --- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 8e66e05c0..f293c1fd3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -6,27 +6,27 @@ ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 3a7d9f455..06c425dcd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,21 +1,21 @@ ------------------------------------------- --- Extra tables, mostly used by indicators -create table ${stats_db_name}.result_projectcount as +create table ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.result_fundercount as +create table ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.project_resultcount as +create table ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( select p.id as pid, count(distinct r.id) as `count`, r.type as type from ${stats_db_name}.project p @@ -29,8 +29,6 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; - create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -39,12 +37,10 @@ from ( from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; -create table ${stats_db_name}.result_apc as +create table ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; - -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +where r.amount is not null; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 6b4d9b1b0..5a4aecd5d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,20 +3,20 @@ ---------------------------------------------------- -- Peer reviewed: -create table ${stats_db_name}.result_peerreviewed as +create table ${stats_db_name}.result_peerreviewed stored as parquet as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; -- Green OA: -create table ${stats_db_name}.result_greenoa as +create table ${stats_db_name}.result_greenoa stored as parquet as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: -create table ${stats_db_name}.result_gold as +create table ${stats_db_name}.result_gold stored as parquet as select r.id, case when gold.gold_oa=1 then true else false end as gold from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index bb0d0ac6c..dfe6246ca 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -40,13 +40,13 @@ SELECT substr(p.id, 4) as id, from ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_classifications AS +CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_concepts AS +CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -55,7 +55,7 @@ from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_datasources as +CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource @@ -66,30 +66,30 @@ FROM ( from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.publication_languages AS +CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_oids AS +CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_pids AS +CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_topics as +CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_citations AS +CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 953eaad6a..4841944a2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -40,20 +40,20 @@ SELECT substr(d.id, 4) AS id, FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE; -CREATE TABLE ${stats_db_name}.dataset_citations AS +CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and d.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_classifications AS +CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_concepts AS +CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -62,7 +62,7 @@ from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_datasources AS +CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -74,24 +74,24 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.dataset_languages AS +CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_oids AS +CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_pids AS +CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_topics AS +CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0210dc8cb..390a5e338 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -40,20 +40,20 @@ SELECT substr(s.id, 4) as id, from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_citations AS +CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and s.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_classifications AS +CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_concepts AS +CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -62,7 +62,7 @@ FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_datasources AS +CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -74,24 +74,24 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.software_languages AS +CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_oids AS +CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_pids AS +CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_topics AS +CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index f7b302186..65c2f0c33 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -40,18 +40,18 @@ FROM ${openaire_db_name}.otherresearchproduct o WHERE o.datainfo.deletedbyinference = FALSE; -- Otherresearchproduct_citations -CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') @@ -59,7 +59,7 @@ SELECT substr(p.id, 4) as id, case FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance @@ -68,22 +68,22 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS +CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 378e0f17b..c75659c46 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,7 +3,7 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ -CREATE TABLE ${stats_db_name}.project_oids AS +CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; CREATE TABLE ${stats_db_name}.project_organizations AS @@ -12,13 +12,13 @@ from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.project_results AS +CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.datainfo.deletedbyinference = false; -create table ${stats_db_name}.project_classification as +create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class @@ -74,7 +74,7 @@ SELECT substr(p.id, 4) AS id, FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false; -create table ${stats_db_name}.funder as +create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index b3cbc9b41..99fa47767 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -123,13 +123,13 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -CREATE TABLE ${stats_db_name}.result_organization AS +CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.result_projects AS +CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 76d31eb5e..6753d8190 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -80,15 +80,15 @@ UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -CREATE TABLE ${stats_db_name}.datasource_languages AS +CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages; -CREATE TABLE ${stats_db_name}.datasource_oids AS +CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; -CREATE TABLE ${stats_db_name}.datasource_organizations AS +CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'datasourceOrganization' @@ -96,11 +96,11 @@ WHERE r.reltype = 'datasourceOrganization' -- datasource sources: -- where the datasource info have been collected from. -create table if not exists ${stats_db_name}.datasource_sources AS +create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; \ No newline at end of file From d05210ba9991c9276b5cb7e5f1258379fa0834b6 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 30 Nov 2021 19:01:48 +0200 Subject: [PATCH 02/47] finished migration to hive only --- .../dhp/oa/graph/stats/oozie_app/finalizedb.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/indicators.sh | 6 +++--- .../dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/observatory-post.sh | 12 +++++++----- .../dhp/oa/graph/stats/oozie_app/observatory-pre.sh | 7 ++++--- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 8eade479a..60771dfa7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,8 +10,11 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo +hive -f foo +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 93faa43d6..72d6d8048 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo +hive --database ${TARGET} -f step16-createIndicatorsTables.sql echo "Indicators created" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index c5bda6d39..37809652d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -15,11 +15,14 @@ echo "Getting file from " $4 hdfs dfs -copyToLocal $4 echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index db8d39af2..d074e6a55 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -10,12 +10,14 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow observatory database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index 55a308c50..be009cd45 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -11,6 +11,7 @@ export TARGET=$2 export SHADOW=$3 echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +hive -e "drop database if exists ${TARGET} cascade" +hive -e "create database if not exists ${TARGET}" +hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo From 915f758c82f366e45388b80e3bfd42b5388d75df Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 16:26:14 +0200 Subject: [PATCH 03/47] moving data to impala cluster and creating shadow databases there --- .../oa/graph/stats/oozie_app/finalizedb.sh | 8 +---- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 10 +----- .../graph/stats/oozie_app/observatory-post.sh | 10 +----- .../graph/stats/oozie_app/scripts/step8.sql | 23 +++++------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 36 +++++++++++++++++-- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 60771dfa7..9de472955 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -11,10 +11,4 @@ export SHADOW=$2 echo "Updating shadow database" hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive -f foo -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +hive -f foo \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 37809652d..a4e7eec57 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -17,12 +17,4 @@ hdfs dfs -copyToLocal $4 echo "Creating monitor database" cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow monitor database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index d074e6a55..12315c9e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -12,12 +12,4 @@ export SHADOW=$3 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow observatory database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 6753d8190..33e1e3527 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual -( - dummy CHAR(1) -); -INSERT INTO ${stats_db_name}.dual -VALUES ('X'); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); + +INSERT INTO ${stats_db_name}.dual VALUES ('X'); + INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', @@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; -UPDATE ${stats_db_name}.datasource_tmp -SET name='Other' -WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp -SET yearofvalidation=null -WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language @@ -91,8 +85,7 @@ FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' - and r.datainfo.deletedbyinference = false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false; -- datasource sources: -- where the datasource info have been collected from. @@ -101,6 +94,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 08d33f4e8..7ac3cefbb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -365,11 +365,43 @@ ${observatory_db_shadow_name} observatory-post.sh - + - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${external_stats_db_name} + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + ${jobTracker} ${nameNode} From ddd34087c25395f135ef0c075264095c62691a67 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 23:05:00 +0200 Subject: [PATCH 04/47] removed 'stored as parquet' from views.. --- .../oozie_app/copyDataToImpalaCluster.sh | 57 +++++++++++++++++++ .../stats/oozie_app/finalizeImpalaCluster.sh | 27 +++++++++ .../graph/stats/oozie_app/scripts/step11.sql | 7 +-- .../graph/stats/oozie_app/scripts/step14.sql | 2 +- .../graph/stats/oozie_app/scripts/step15.sql | 2 +- 5 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..9846eb66a --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,57 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function copydb() { + db=$1 + + # copy the databases from ocean to impala + + #echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp + + # change ownership to impala + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + echo "creating schema for ${db}" + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # run the same command twice because we may have failures in the first run (due to views pointing to the same db) + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db +} + +STATS_DB=$1 +MONITOR_DB=$2 +OBSERVATORY_DB=$3 +EXT_DB=$4 + +copydb $EXT_DB +copydb $STATS_DB +copydb $MONITOR_DB +copydb $OBSERVATORY_DB + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..31107c7ed --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,27 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createShadowDB() { + SOURCE=$1 + SHADOW=$2 + + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +} + +STATS_DB=$1 +STATS_DB_SHADOW=$2 +MONITOR_DB=$3 +MONITOR_DB_SHADOW=$4 +OBSERVATORY_DB=$5 +OBSERVATORY_DB_SHADOW=$6 + +createShadowDB $STATS_DB $STATS_DB_SHADOW +createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW +createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 280c1bb51..d699b68c3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -102,7 +102,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, @@ -113,7 +113,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; - --- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + AND project.id = result_projects.project; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index f38ad886e..e9c2e014c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -25,7 +25,7 @@ SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses STORED AS PARQUET AS +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses UNION ALL SELECT * FROM ${stats_db_name}.dataset_licenses diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index f293c1fd3..58e7c7c11 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed From 0353f93d54c948df36f01b4da4498fd2264375eb Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Apr 2022 12:49:27 +0300 Subject: [PATCH 05/47] added new hive opts --- .../dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 9 +++++---- .../dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 10 +++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 72d6d8048..fef569b59 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,12 +8,13 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo -hive -f foo -hive --database ${TARGET} -f step16-createIndicatorsTables.sql -echo "Indicators created" \ No newline at end of file +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo +hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql +echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 7ac3cefbb..5661e1e73 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -67,10 +67,14 @@ hive.txn.timeout ${hive_timeout} + + mapred.job.queue.name + analytics + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -276,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - + @@ -414,4 +418,4 @@ - \ No newline at end of file + From 6fc9ef53f64ac137a51e4da11a0bd1f3ef36304b Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Jul 2022 16:36:20 +0300 Subject: [PATCH 06/47] addded command line params to allow hive actions to run --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 19 ++++++++++++------- .../oa/graph/stats/oozie_app/indicators.sh | 3 ++- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index fafb45cb0..0ce57e095 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -9,6 +9,8 @@ fi CONTEXT_API=$1 TARGET_DB=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" + TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -280,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - + From e84dd5fe2668d9b45998a5b77798a56c7252a704 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 26 Nov 2021 15:40:40 +0200 Subject: [PATCH 07/47] first --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 16 ++++++---------- .../dhp/oa/graph/stats/oozie_app/finalizedb.sh | 4 +--- .../oa/graph/stats/oozie_app/scripts/step11.sql | 2 +- .../oa/graph/stats/oozie_app/scripts/step15.sql | 2 +- .../graph/stats/oozie_app/scripts/step15_5.sql | 8 ++------ .../oa/graph/stats/oozie_app/scripts/step8.sql | 2 +- 6 files changed, 12 insertions(+), 22 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index b66ab47e0..fafb45cb0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -30,18 +30,14 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} echo "Creating and populating impala tables" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" -impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" -impala-shell -d ${TARGET_DB} -q "invalidate metadata" -impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" -impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" -impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" +hive -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" +hive -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" +hive -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" +hive -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" +hive -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" echo "Cleaning up" -hdfs dfs -rm -f -r -skipTrash ${TMP} rm concepts.csv rm categories.csv rm contexts.csv diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index d04c5ccfd..8eade479a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,9 +10,7 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -impala-shell -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "invalidate metadata" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - impala-shell -q "create database if not exists ${SHADOW}" impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 41c3ed751..122947050 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index cec22cd3e..9b5c6ec06 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 86ead4a2c..9b2630286 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -29,8 +29,6 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; - create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -39,12 +37,10 @@ from ( from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; -create table ${stats_db_name}.result_apc as +create table ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; - -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +where r.amount is not null; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 01bed17cc..f089693fc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -102,6 +102,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; \ No newline at end of file From 778a1a724f9b5f129c629d8dc77cd0e81c40c30d Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 30 Nov 2021 19:01:48 +0200 Subject: [PATCH 08/47] finished migration to hive only --- .../dhp/oa/graph/stats/oozie_app/finalizedb.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/indicators.sh | 6 +++--- .../dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh | 11 +++++++---- .../dhp/oa/graph/stats/oozie_app/observatory-post.sh | 12 +++++++----- .../dhp/oa/graph/stats/oozie_app/observatory-pre.sh | 7 ++++--- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 8eade479a..60771dfa7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -10,8 +10,11 @@ export SOURCE=$1 export SHADOW=$2 echo "Updating shadow database" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f - -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo +hive -f foo +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 93faa43d6..72d6d8048 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo +hive --database ${TARGET} -f step16-createIndicatorsTables.sql echo "Indicators created" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index c5bda6d39..37809652d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -15,11 +15,14 @@ echo "Getting file from " $4 hdfs dfs -copyToLocal $4 echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index db8d39af2..d074e6a55 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -10,12 +10,14 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive -f foo echo "Impala shell finished" echo "Updating shadow observatory database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +hive -e "create database if not exists ${SHADOW}" +hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo +hive -f foo +hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo +hive -f foo echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index 55a308c50..be009cd45 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -11,6 +11,7 @@ export TARGET=$2 export SHADOW=$3 echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +hive -e "drop database if exists ${TARGET} cascade" +hive -e "create database if not exists ${TARGET}" +hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo From 2754c3dd62e2c7bf60ebd00cdc7a0ec9ab9a80c1 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 16:26:14 +0200 Subject: [PATCH 09/47] moving data to impala cluster and creating shadow databases there --- .../oa/graph/stats/oozie_app/finalizedb.sh | 8 +---- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 10 +----- .../graph/stats/oozie_app/observatory-post.sh | 10 +----- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 36 +++++++++++++++++-- 4 files changed, 37 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 60771dfa7..9de472955 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -11,10 +11,4 @@ export SHADOW=$2 echo "Updating shadow database" hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive -f foo -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +hive -f foo \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 37809652d..a4e7eec57 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -17,12 +17,4 @@ hdfs dfs -copyToLocal $4 echo "Creating monitor database" cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow monitor database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index d074e6a55..12315c9e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -12,12 +12,4 @@ export SHADOW=$3 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive -f foo -echo "Impala shell finished" - -echo "Updating shadow observatory database" -hive -e "create database if not exists ${SHADOW}" -hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo -hive -f foo -hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo -hive -f foo -echo "Shadow db ready!" \ No newline at end of file +echo "Impala shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 08d33f4e8..7ac3cefbb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -365,11 +365,43 @@ ${observatory_db_shadow_name} observatory-post.sh - + - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${external_stats_db_name} + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + ${jobTracker} ${nameNode} From 1ddea4f4423011568195e41f67863fd1a1140059 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 13 Dec 2021 23:05:00 +0200 Subject: [PATCH 10/47] removed 'stored as parquet' from views.. --- .../oozie_app/copyDataToImpalaCluster.sh | 57 +++++++++++++++++++ .../stats/oozie_app/finalizeImpalaCluster.sh | 27 +++++++++ .../graph/stats/oozie_app/scripts/step11.sql | 2 +- .../graph/stats/oozie_app/scripts/step15.sql | 2 +- 4 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..9846eb66a --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,57 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function copydb() { + db=$1 + + # copy the databases from ocean to impala + + #echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp + + # change ownership to impala + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + echo "creating schema for ${db}" + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # run the same command twice because we may have failures in the first run (due to views pointing to the same db) + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db +} + +STATS_DB=$1 +MONITOR_DB=$2 +OBSERVATORY_DB=$3 +EXT_DB=$4 + +copydb $EXT_DB +copydb $STATS_DB +copydb $MONITOR_DB +copydb $OBSERVATORY_DB + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..31107c7ed --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,27 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createShadowDB() { + SOURCE=$1 + SHADOW=$2 + + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +} + +STATS_DB=$1 +STATS_DB_SHADOW=$2 +MONITOR_DB=$3 +MONITOR_DB_SHADOW=$4 +OBSERVATORY_DB=$5 +OBSERVATORY_DB_SHADOW=$6 + +createShadowDB $STATS_DB $STATS_DB_SHADOW +createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW +createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 122947050..41c3ed751 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id AND pr.id = p.id AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 9b5c6ec06..cec22cd3e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; -CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as +CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed union all select * from ${stats_db_name}.dataset_refereed From 028873cc512de8d528a3cc5dd52de028b66f809c Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Apr 2022 12:49:27 +0300 Subject: [PATCH 11/47] added new hive opts --- .../dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 9 +++++---- .../dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 10 +++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 72d6d8048..fef569b59 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,12 +8,13 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo -hive -f foo -hive --database ${TARGET} -f step16-createIndicatorsTables.sql -echo "Indicators created" \ No newline at end of file +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo +hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql +echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 7ac3cefbb..5661e1e73 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -67,10 +67,14 @@ hive.txn.timeout ${hive_timeout} + + mapred.job.queue.name + analytics + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -276,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - + @@ -414,4 +418,4 @@ - \ No newline at end of file + From c8309fe18ecb09a1a223c43e61804429d8f1f779 Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 29 Jul 2022 16:36:20 +0300 Subject: [PATCH 12/47] addded command line params to allow hive actions to run --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 19 ++++++++++++------- .../oa/graph/stats/oozie_app/indicators.sh | 3 ++- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index fafb45cb0..0ce57e095 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -9,6 +9,8 @@ fi CONTEXT_API=$1 TARGET_DB=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" + TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -280,7 +280,7 @@ ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh - + From dcb958e1467bf53761d826d74e7bc107f6ab6d91 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 4 Jan 2023 11:39:01 +0200 Subject: [PATCH 13/47] Changes to execute the stats wf only in hive --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 4 +- .../oa/graph/stats/oozie_app/finalizedb.sh | 6 +- .../oa/graph/stats/oozie_app/indicators.sh | 4 +- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 15 +- .../graph/stats/oozie_app/observatory-post.sh | 2 +- .../stats/oozie_app/scripts/step15_5.sql | 11 +- .../scripts/step16-createIndicatorsTables.sql | 693 +++++++----------- .../scripts/step20-createMonitorDB.sql | 105 +-- .../scripts/step21-createObservatoryDB.sql | 40 +- .../graph/stats/oozie_app/scripts/step8.sql | 2 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 144 ++-- 11 files changed, 472 insertions(+), 554 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 0ce57e095..e152eb1ee 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -31,8 +31,8 @@ hdfs dfs -copyFromLocal categories.csv ${TMP} hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} -export HADOOP_USER="antonis.lempesis" -export HADOOP_USER_NAME="antonis.lempesis" +export HADOOP_USER="dimitris.pierrakos" +export HADOOP_USER_NAME="dimitris.pierrakos" echo "Creating and populating impala tables" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 9de472955..011cfcc28 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -8,7 +8,9 @@ fi export SOURCE=$1 export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" echo "Updating shadow database" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive -f foo \ No newline at end of file +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index fd95c8514..6c76e35f2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,8 +8,8 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" -export HADOOP_USER="antonis.lempesis" +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index a4e7eec57..25095f4d3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -11,10 +11,15 @@ export TARGET=$2 export SHADOW=$3 export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH + echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo -hive -f foo -echo "Impala shell finished" \ No newline at end of file +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index 12315c9e8..fafafe59a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -12,4 +12,4 @@ export SHADOW=$3 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive -f foo -echo "Impala shell finished" \ No newline at end of file +echo "Hive shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 9b2630286..1ae856355 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -29,6 +29,13 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; +create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; +create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; +create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; +create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; +create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; + create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -43,4 +50,6 @@ from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; \ No newline at end of file +where r.amount is not null; + +create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 1bda07629..ac4d4202a 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,5 +1,5 @@ -- Sprint 1 ---- -create table indi_pub_green_oa stored as parquet as +create table if not exists indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from publication p left outer join ( @@ -12,9 +12,9 @@ from publication p or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp on p.id= tmp.id; -compute stats indi_pub_green_oa; +ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS; -create table indi_pub_grey_lit stored as parquet as +create table if not exists indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from publication p left outer join ( @@ -25,9 +25,9 @@ from publication p not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; -compute stats indi_pub_grey_lit; +ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS; -create table indi_pub_doi_from_crossref stored as parquet as +create table if not exists indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from publication p left outer join @@ -36,10 +36,10 @@ from publication p where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; -compute stats indi_pub_doi_from_crossref; +ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- -create table indi_result_has_cc_licence stored as parquet as +create table if not exists indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r left outer join (select r.id, license.type as lic from result r @@ -47,9 +47,9 @@ from result r where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence; +ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS; -create table indi_result_has_cc_licence_url stored as parquet as +create table if not exists indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from result r left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -58,31 +58,31 @@ from result r WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence_url; +ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table indi_pub_has_abstract stored as parquet as -select distinct publication.id, coalesce(abstract, 1) has_abstract +create table if not exists indi_pub_has_abstract stored as parquet as +select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract from publication; -compute stats indi_pub_has_abstract; +ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS; -create table indi_result_with_orcid stored as parquet as +create table if not exists indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r left outer join (select id, 1 as has_orcid from result_orcid) tmp on r.id= tmp.id; -compute stats indi_result_with_orcid; +ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- -create table indi_funded_result_with_fundref stored as parquet as +create table if not exists indi_funded_result_with_fundref stored as parquet as select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp on r.result= tmp.result; -compute stats indi_funded_result_with_fundref; +ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; -- create table indi_result_org_collab stored as parquet as -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations @@ -92,77 +92,59 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_collab; -- -create table indi_result_org_collab stored as parquet as -with tmp as ( -select distinct ro.organization organization, ro.id from result_organization ro -join organization o on o.id=ro.organization where o.name is not null) +create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro +join organization o on o.id=ro.organization where o.name is not null; + +create table if not exists indi_result_org_collab stored as parquet as select o1.organization org1, o2.organization org2, count(o1.id) as collaborations from tmp as o1 -join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization -group by org1, org2; +join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization +group by o1.organization, o2.organization; -compute stats indi_result_org_collab; +drop table tmp purge; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats indi_result_org_country_collab; --- -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select distinct ro.organization organization, ro.id, o.country from result_organization ro -join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) +ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; + +create TEMPORARY TABLE tmp AS +select distinct ro.organization organization, ro.id, o.country from result_organization ro +join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_org_country_collab stored as parquet as select o1.organization org1,o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o2.country; -compute stats indi_result_org_country_collab; +drop table tmp purge; --- create table indi_result_org_collab stored as parquet as --- with tmp as --- (select o.id, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id) --- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id --- group by o1.id, o2.id, o1.type; --- --- compute stats indi_result_org_collab; --- -create table indi_project_collab_org stored as parquet as +ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS; + +create table if not exists indi_project_collab_org stored as parquet as select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations from organization_projects as o1 join organization_projects as o2 on o1.project=o2.project where o1.id!=o2.id group by o1.id, o2.id; -compute stats indi_project_collab_org; +ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; -create table indi_project_collab_org_country stored as parquet as - with tmp as - (select o.id organization, o.country , ro.project as project from organization o +create TEMPORARY TABLE tmp AS +select o.id organization, o.country , ro.project as project from organization o join organization_projects ro on o.id=ro.id - and o.country <> 'UNKNOWN') + and o.country <> 'UNKNOWN'; + +create table if not exists indi_project_collab_org_country stored as parquet as select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country group by o1.organization, o2.country; -compute stats indi_project_collab_org_country; +drop table tmp purge; -create table indi_funder_country_collab stored as parquet as +ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS; + +create table if not exists indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op join organization o on o.id=op.id join project p on p.id=op.project @@ -173,36 +155,26 @@ from tmp as f1 where f1.country<>f2.country group by f1.funder, f2.country, f1.country; -compute stats indi_funder_country_collab; --- --- create table indi_result_country_collab stored as parquet as --- with tmp as --- (select country, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where country <> 'UNKNOWN') --- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.country<>o2.country --- group by o1.country, o2.country, o1.type; --- --- compute stats indi_result_country_collab; +ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS; -create table indi_result_country_collab stored as parquet as -with tmp as - (select distinct country, ro.id as result from organization o +create TEMPORARY TABLE tmp AS +select distinct country, ro.id as result from organization o join result_organization ro on o.id=ro.organization - where country <> 'UNKNOWN' and o.name is not null) + where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_country_collab stored as parquet as select o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country group by o1.country, o2.country; -compute stats indi_result_country_collab; +drop table tmp purge; + +ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS; ---- Sprint 4 ---- -create table indi_pub_diamond stored as parquet as +create table if not exists indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd left outer join ( @@ -212,21 +184,9 @@ from publication_datasources pd and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; -compute stats indi_pub_diamond; +ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS; ---create table indi_pub_hybrid stored as parquet as ---select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid ---from publication_datasources pd --- left outer join ( --- select pd.id, 1 as is_hybrid from publication_datasources pd --- join datasource d on d.id=pd.datasource --- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) --- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp --- on pd.id=tmp.id; --- ---compute stats indi_pub_hybrid; - -create table indi_pub_in_transformative stored as parquet as +create table if not exists indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative from publication pd left outer join ( @@ -236,9 +196,9 @@ from publication pd and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; -compute stats indi_pub_in_transformative; +ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS; -create table indi_pub_closed_other_open stored as parquet as +create table if not exists indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri left outer join (select ri.id, 1 as pub_closed_other_open from result_instance ri @@ -248,180 +208,16 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; -compute stats indi_pub_closed_other_open; +ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- -create table indi_result_no_of_copies stored as parquet as +create table if not exists indi_result_no_of_copies stored as parquet as select id, count(id) as number_of_copies from result_instance group by id; -compute stats indi_result_no_of_copies; +ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- ---create table indi_pub_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION ALL --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * FROM --- (SELECT id, issn_printed as issn --- FROM datasource WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource WHERE issn_online IS NOT NULL) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM publication_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM publication_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; - ---compute stats indi_pub_gold_oa; --- ---create table indi_datasets_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM dataset_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_datasets_gold_oa; - ---create table indi_software_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM software_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM software_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_software_gold_oa; - ---create table indi_org_findable stored as parquet as ---with result_with_pid as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro --- join result_pids rp on rp.id=ro.id --- group by ro.organization), ---result_has_abstract as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro --- join result rp on rp.id=ro.id where rp.abstract=true --- group by ro.organization), ---allresults as ( --- select organization, count(distinct id) no_allresults from result_organization --- group by organization), ---result_with_pid_share as ( --- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share --- from allresults --- join result_with_pid on result_with_pid.organization=allresults.organization), ---result_with_abstract_share as ( --- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share --- from allresults --- join result_has_abstract on result_has_abstract.organization=allresults.organization) ---select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable ---from allresults ---join result_with_pid_share on result_with_pid_share.organization=allresults.organization ---left outer join ( --- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; --- ---compute stats indi_org_findable; --- ---create table indi_org_openess stored as parquet as ---WITH datasets_oa as ( --- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join dataset ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---software_oa as ( --- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join software ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---pubs_oa as ( --- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join publication ds on dg.id=ds.id --- where dg.is_gold=1 --- group by ro.organization), ---allpubs as ( --- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro --- join publication ps on ps.id=ro.id --- group by ro.organization), ---alldatasets as ( --- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro --- join dataset ps on ps.id=ro.id --- group by ro.organization), ---allsoftware as ( --- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro --- join software ps on ps.id=ro.id --- group by ro.organization), ---allpubsshare as ( --- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs --- join pubs_oa on allpubs.organization=pubs_oa.organization), ---alldatasetssshare as ( --- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c --- from alldatasets --- join datasets_oa on alldatasets.organization=datasets_oa.organization), ---allsoftwaresshare as ( --- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s --- from allsoftware --- join software_oa on allsoftware.organization=software_oa.organization) ---select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess ---FROM allpubsshare ---left outer join ( --- select organization,c from --- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization ---left outer join ( --- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; --- ---compute stats indi_org_openess; --- -create table indi_pub_hybrid_oa_with_cc stored as parquet as +create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn FROM stats_ext.plan_s_jn @@ -436,7 +232,7 @@ create table indi_pub_hybrid_oa_with_cc stored as parquet as SELECT id, issn_printed as issn FROM datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id,issn_online as issn FROM datasource WHERE issn_online IS NOT NULL ) as issn @@ -451,45 +247,44 @@ FROM publication_datasources pd JOIN indi_result_has_cc_licence cc on pd.id=cc.id where cc.has_cc_license=1) tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -create table indi_pub_downloads stored as parquet as +create table if not exists indi_pub_downloads stored as parquet as SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id order by no_downloads desc; -compute stats indi_pub_downloads; +ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS; -create table indi_pub_downloads_datasource stored as parquet as +create table if not exists indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; -compute stats indi_pub_downloads_datasource; +ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS; -create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id where downloads>0 -GROUP BY result_id, `year` -order by `year` asc; +create table if not exists indi_pub_downloads_year stored as parquet as +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads +from openaire_prod_usage_stats.usage_stats us +join publication on result_id=id where downloads>0 +GROUP BY result_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_year; +ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS; -create table indi_pub_downloads_datasource_year stored as parquet as +create table if not exists indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id +join publication on result_id=id where downloads>0 -GROUP BY result_id, repository_id, `year` -order by `year` asc, result_id; +GROUP BY result_id, repository_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_datasource_year; +ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- -create table indi_pub_gold_oa stored as parquet as +create table if not exists indi_pub_gold_oa stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -518,7 +313,7 @@ create table indi_pub_gold_oa stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -538,9 +333,9 @@ FROM JOIN gold_oa on issn.issn = gold_oa.issn) tmp on pd.id=tmp.id; -compute stats indi_pub_gold_oa; +ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; -create table indi_pub_hybrid stored as parquet as +create table if not exists indi_pub_hybrid stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -571,7 +366,7 @@ create table indi_pub_hybrid stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -591,15 +386,15 @@ from publication_datasources pd where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid; +ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS; -create table indi_org_fairness stored as parquet as +create table if not exists indi_org_fairness stored as parquet as --return results with PIDs, and rich metadata group by organization with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id --join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -611,16 +406,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness; +ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS; -create table indi_org_fairness_pub_pr stored as parquet as +create table if not exists indi_org_fairness_pub_pr stored as parquet as with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join publication p on p.id=ro.id join indi_pub_doi_from_crossref dc on dc.id=p.id join indi_pub_grey_lit gl on gl.id=p.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 group by ro.organization), allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -632,150 +427,180 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub_pr; +ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS; -create table indi_org_fairness_pub_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id +CREATE TEMPORARY table result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id where cast(year as int)>2003 - group by organization, year) + group by organization, year; + +create table if not exists indi_org_fairness_pub_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -compute stats indi_org_fairness_pub_year; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_pub as -with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair - from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) - and (authors>0) and cast(year as int)>2003 - group by ro.organization), - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id - where cast(year as int)>2003 - group by organization) +ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) + and (authors>0) and cast(year as int)>2003 + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as + select organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id + where cast(year as int)>2003 + group by organization; + +create table if not exists indi_org_fairness_pub as select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization; +from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro +ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int)>2003 - group by organization, year) ---return results_fair/all_results -select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + group by organization, year; -compute stats indi_org_fairness_year; +create table if not exists indi_org_fairness_year stored as parquet as + select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -create table indi_org_findable_year stored as parquet as ---return results with PIDs group by organization,year - with result_with_pid as - (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +DROP table result_fair purge; +DROP table allresults purge; + +ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as + select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization, year), ---return all results group by organization,year - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization, year) ---return results_with_pid/all_results + group by organization, year; + +create table if not exists indi_org_findable_year stored as parquet as select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; -compute stats indi_org_findable_year; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_findable stored as parquet as ---return results with PIDs group by organization - with result_with_pid as - (select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as +select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization), ---return all results group by organization - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as +select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization) ---return results_with_pid/all_results + group by organization; + +create table if not exists indi_org_findable stored as parquet as select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; -compute stats indi_org_findable; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_openess stored as parquet as - WITH pubs_oa as ( - SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_findable COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa as +SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - datasets_oa as ( - SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization; + +CREATE TEMPORARY TABLE datasets_oa as +SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - software_oa as ( - SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization; + +CREATE TEMPORARY TABLE software_oa as +SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - allpubs as ( - SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allpubs as +SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - alldatasets as ( - SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE alldatasets as +SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allsoftware as ( - SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allsoftware as +SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allpubsshare as ( - select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization), - alldatasetssshare as ( - select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization; + +CREATE TEMPORARY TABLE allpubsshare as +select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization; + +CREATE TEMPORARY TABLE alldatasetssshare as +select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization), - allsoftwaresshare as ( - select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization; + +CREATE TEMPORARY TABLE allsoftwaresshare as +select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization) + join software_oa on allsoftware.organization=software_oa.organization; + +create table if not exists indi_org_openess stored as parquet as select allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select organization,d from @@ -785,55 +610,75 @@ select allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization; -compute stats indi_org_openess; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_org_openess_year stored as parquet as - WITH pubs_oa as ( - SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_openess COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa AS +SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization,r.year), - datasets_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization,r.year; + +CREATE TEMPORARY TABLE datasets_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - software_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE software_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - allpubs as ( - SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE allpubs as +SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication p on p.id=ro.id where cast(p.year as int)>2003 - group by ro.organization, p.year), - alldatasets as ( - SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization, p.year; + +CREATE TEMPORARY TABLE alldatasets as +SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset d on d.id=ro.id where cast(d.year as int)>2003 - group by ro.organization, d.year), - allsoftware as ( - SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization, d.year; + +CREATE TEMPORARY TABLE allsoftware as +SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software s on s.id=ro.id where cast(s.year as int)>2003 - group by ro.organization, s.year), - allpubsshare as ( - select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), - alldatasetssshare as ( - select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization, s.year; + +CREATE TEMPORARY TABLE allpubsshare as +select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); + +CREATE TEMPORARY TABLE alldatasetssshare as +select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), - allsoftwaresshare as ( - select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); + +CREATE TEMPORARY TABLE allsoftwaresshare as +select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) + join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); + + +create table if not exists indi_org_openess_year stored as parquet as select allpubsshare.year, allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select year, organization,d from @@ -843,9 +688,19 @@ select allpubsshare.year, allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; -compute stats indi_org_openess_year; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_pub_has_preprint stored as parquet as +ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS; + +create table if not exists indi_pub_has_preprint stored as parquet as select distinct p.id, coalesce(has_preprint, 0) as has_preprint from publication_classifications p left outer join ( @@ -854,9 +709,9 @@ from publication_classifications p where p.type='Preprint') tmp on p.id= tmp.id; -compute stats indi_pub_has_preprint; +ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS; -create table indi_pub_in_subscribed stored as parquet as +create table if not exists indi_pub_in_subscribed stored as parquet as select distinct p.id, coalesce(is_subscription, 0) as is_subscription from publication p left outer join( @@ -867,9 +722,9 @@ from publication p where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp on p.id=tmp.id; -compute stats indi_pub_in_subscribed; +ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS; -create table indi_result_with_pid as +create table if not exists indi_result_with_pid as select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from result p left outer join ( @@ -877,4 +732,4 @@ from result p from result_pids p) tmp on p.id= tmp.id; -compute stats indi_result_with_pid; \ No newline at end of file +ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 98dca7129..195836480 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -10,6 +10,11 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_ create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create table TARGET.result stored as parquet as select distinct * from ( @@ -54,84 +59,87 @@ create table TARGET.result stored as parquet as 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje - 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork ) )) foo; -compute stats TARGET.result; + +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations; +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_references_oc; +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations_oc; +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_classifications; +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_apc; +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_concepts; +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_datasources; +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fundercount; +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_gold; +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_greenoa; +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_languages; +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_licenses; +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_oids; +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_organization; +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_peerreviewed; +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_pids; +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projectcount; +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projects; +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_refereed; +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_sources; +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_topics; +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fos; +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -compute stats TARGET.result_result; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -140,7 +148,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -compute stats TARGET.datasource_results; +ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -157,28 +165,28 @@ create view if not exists TARGET.project_resultcount as select * from SOURCE.pro create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -compute stats TARGET.project_results; +ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence; +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence_url; +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_has_abstract; +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_with_orcid; +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_funded_result_with_fundref; +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -187,30 +195,30 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_diamond; +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_in_transformative; +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_closed_other_open; +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_no_of_copies; +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads; +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_year; +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource_year; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid; +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -221,11 +229,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); - +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; --create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_datasets_gold_oa; --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_software_gold_oa; - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e24370e7d..2d7d572b3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,6 +8,8 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; +ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, @@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, @@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, @@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, @@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, @@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, @@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, @@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, @@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, @@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, @@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, @@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, @@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, @@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, @@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, @@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, @@ -558,4 +594,6 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 131f96df9..248716b36 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -96,6 +96,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 81da11903..9976b8455 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -74,7 +74,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -302,22 +302,22 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${jobTracker} - ${nameNode} - finalizedb.sh - ${stats_db_name} - ${stats_db_shadow_name} - finalizedb.sh - + + + + + + + + + + + + + @@ -355,67 +355,67 @@ stats_db_name=${stats_db_name} observatory_db_name=${observatory_db_name} - - - - - - - ${jobTracker} - ${nameNode} - observatory-post.sh - ${stats_db_name} - ${observatory_db_name} - ${observatory_db_shadow_name} - observatory-post.sh - - - - - - - - ${jobTracker} - ${nameNode} - copyDataToImpalaCluster.sh - ${external_stats_db_name} - ${stats_db_name} - ${monitor_db_name} - ${observatory_db_name} - copyDataToImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - finalizeImpalaCluster.sh - ${stats_db_name} - ${stats_db_shadow_name} - ${monitor_db_name} - ${monitor_db_shadow_name} - ${observatory_db_name} - ${observatory_db_shadow_name} - finalizeImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - updateCache.sh - ${stats_tool_api_url} - updateCache.sh - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 43f6d4f2965204a760bef5db453285aaf90c818e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 11:26:47 +0200 Subject: [PATCH 14/47] -Monitor DB workflow --- .../monitor/oozie_app/config-default.xml | 34 ++++++ .../graph/monitor/oozie_app/monitor-post.sh | 21 ++++ .../dhp/oa/graph/monitor/oozie_app/monitor.sh | 24 ++++ .../oa/graph/monitor/oozie_app/workflow.xml | 105 ++++++++++++++++++ 4 files changed, 184 insertions(+) create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml new file mode 100644 index 000000000..63fc84d75 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + stats_tool_api_url + ${stats_tool_api_url} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh new file mode 100644 index 000000000..b8c71681a --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh @@ -0,0 +1,21 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +impala-shell -q "invalidate metadata;" +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow monitor database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh new file mode 100644 index 000000000..f39bf4893 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -0,0 +1,24 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH + +echo "Creating monitor database" +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml new file mode 100644 index 000000000..7e4cfc759 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + stats_db_name + the target stats database name + + + stats_db_shadow_name + the name of the shadow schema + + + monitor_db_name + the target monitor db name + + + monitor_db_shadow_name + the name of the shadow monitor db + + + stats_tool_api_url + The url of the API of the stats tool. Is used to trigger the cache update. + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. + + + context_api_url + the base url of the context api (https://services.openaire.eu/openaire) + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + monitor.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/createMonitorDB.sql + monitor.sh + + + + + + + + + ${jobTracker} + ${nameNode} + monitor-post.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + monitor-post.sh + + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + + + + + + + \ No newline at end of file From 34d4bf727ce668c08cb658d87229ca1bc95d6704 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 11:28:37 +0200 Subject: [PATCH 15/47] Bug fixes --- .../graph/monitor/oozie_app/monitor-post.sh | 16 +- .../oozie_app/scripts/createMonitorDB.sql | 149 ++++++++++++++++++ .../oa/graph/monitor/oozie_app/workflow.xml | 12 -- 3 files changed, 157 insertions(+), 20 deletions(-) create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh index b8c71681a..dd82310e0 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh @@ -11,11 +11,11 @@ export TARGET=$2 export SHADOW=$3 impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - -echo "Impala shell finished" - -echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" \ No newline at end of file +#impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +#echo "Impala shell finished" +# +#echo "Updating shadow monitor database" +#impala-shell -q "create database if not exists ${SHADOW}" +#impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +#impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +#echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql new file mode 100644 index 000000000..7a27b0ea5 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -0,0 +1,149 @@ +DROP TABLE IF EXISTS TARGET.result_new; + +create table TARGET.result_new stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( +-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University + ) )) foo; + +ANALYZE TABLE TARGET.result_new COMPUTE STATISTICS; + +INSERT INTO TARGET.result select * from TARGET.result_new; + +INSERT INTO TARGET.result_citations select * from TARGET.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; + +INSERT INTO TARGET.result_references_oc select * from TARGET.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_citations_oc select * from TARGET.result_citations_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_classifications select * from TARGET.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; + +INSERT INTO TARGET.result_apc select * from TARGET.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_concepts select * from TARGET.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; + +INSERT INTO TARGET.result_datasources select * from TARGET.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fundercount select * from TARGET.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_gold select * from TARGET.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; + +INSERT INTO TARGET.result_greenoa select * from TARGET.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; + +INSERT INTO TARGET.result_languages select * from TARGET.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; + +INSERT INTO TARGET.result_licenses select * from TARGET.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; + +INSERT INTO TARGET.result_oids select * from TARGET.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_organization select * from TARGET.result_organization orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; + +INSERT INTO TARGET.result_peerreviewed select * from TARGET.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_pids select * from TARGET.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projectcount select * from TARGET.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projects select * from TARGET.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; + +INSERT INTO TARGET.result_refereed select * from TARGET.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_sources select * from TARGET.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_topics select * from TARGET.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +create view TARGET.foo1 as select * from TARGET.result_result rr where rr.source in (select id from TARGET.result_new); +create view TARGET.foo2 as select * from TARGET.result_result rr where rr.target in (select id from TARGET.result_new); +INSERT INTO TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; + + +-- indicators +-- Sprint 1 ---- +INSERT INTO TARGET.indi_pub_green_oa select * from TARGET.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_grey_lit select * from TARGET.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_doi_from_crossref select * from TARGET.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +-- Sprint 2 ---- +INSERT INTO TARGET.indi_result_has_cc_licence select * from TARGET.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_has_cc_licence_url select * from TARGET.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_has_abstract select * from TARGET.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_orcid select * from TARGET.indi_result_with_orcid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +---- Sprint 3 ---- +INSERT INTO TARGET.indi_funded_result_with_fundref select * from TARGET.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; +---- Sprint 4 ---- +INSERT INTO TARGET.indi_pub_diamond select * from TARGET.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_transformative select * from TARGET.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_closed_other_open select * from TARGET.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +---- Sprint 5 ---- +INSERT INTO TARGET.indi_result_no_of_copies select * from TARGET.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +---- Sprint 6 ---- +INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from TARGET.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads select * from TARGET.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource select * from TARGET.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_year select * from TARGET.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from TARGET.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +---- Sprint 7 ---- +INSERT INTO TARGET.indi_pub_gold_oa select * from TARGET.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_hybrid select * from TARGET.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; + +INSERT INTO TARGET.indi_pub_has_preprint select * from TARGET.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_subscribed select * from TARGET.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_pid select * from TARGET.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_datasets_gold_oa; +--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_software_gold_oa; +DROP TABLE TARGET.result_new; diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml index 7e4cfc759..ab51931b6 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -89,17 +89,5 @@ - - - ${jobTracker} - ${nameNode} - updateCache.sh - ${stats_tool_api_url} - updateCache.sh - - - - - \ No newline at end of file From 4d7553c9f1df96349a00f682d2348423b8a28db8 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 17:19:19 +0200 Subject: [PATCH 16/47] Bug fixes --- .../oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index 7a27b0ea5..63085e7a1 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -2,10 +2,6 @@ DROP TABLE IF EXISTS TARGET.result_new; create table TARGET.result_new stored as parquet as select distinct * from ( - select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) - union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) - union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( -- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University From cf58e4a5e47d631811c6459241cdc03bea46684f Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 25 Jan 2023 16:03:16 +0200 Subject: [PATCH 17/47] =?UTF-8?q?Added=20Arts=20et=20M=C3=A9tiers=20ParisT?= =?UTF-8?q?ech?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 195836480..6c0dd92f0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -61,6 +61,8 @@ create table TARGET.result stored as parquet as 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From 2dc6d47270072c061b4cb36a070bd5dd29ad8a8d Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 6 Feb 2023 13:18:53 +0200 Subject: [PATCH 18/47] Changes 06022023 --- .../oozie_app/copyDataToImpalaCluster.sh | 10 +- .../stats/oozie_app/finalizeImpalaCluster.sh | 5 +- .../oa/graph/stats/oozie_app/finalizedb.sh | 7 +- .../oa/graph/stats/oozie_app/indicators.sh | 2 +- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 5 +- .../graph/stats/oozie_app/observatory-post.sh | 7 +- .../graph/stats/oozie_app/observatory-pre.sh | 5 +- .../scripts/step20-createMonitorDB.sql | 6 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 155 +++++++++++------- 9 files changed, 127 insertions(+), 75 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 9846eb66a..4a3010867 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,13 +6,16 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi +#export HADOOP_USER_NAME="dimitris.pierrakos" +export HADOOP_USER_NAME=$4 + function copydb() { db=$1 # copy the databases from ocean to impala #echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp # change ownership to impala hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db @@ -48,9 +51,10 @@ function copydb() { STATS_DB=$1 MONITOR_DB=$2 OBSERVATORY_DB=$3 -EXT_DB=$4 +HADOOP_USER_NAME=$4 +#EXT_DB=$4 -copydb $EXT_DB +#copydb $EXT_DB copydb $STATS_DB copydb $MONITOR_DB copydb $OBSERVATORY_DB diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh index 31107c7ed..fedfa00af 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh @@ -10,9 +10,10 @@ function createShadowDB() { SOURCE=$1 SHADOW=$2 + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE"; impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - - impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - } STATS_DB=$1 diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 011cfcc28..5863625a1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -12,5 +12,8 @@ export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark export HADOOP_USER_NAME="oozie" echo "Updating shadow database" -hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive $HIVE_OPTS -f foo \ No newline at end of file +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 6c76e35f2..473864315 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,7 +8,7 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false" export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 25095f4d3..db0aa3485 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -17,9 +17,8 @@ export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH - echo "Creating monitor database" #cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo -cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo hive $HIVE_OPTS -f foo -echo "Hive shell finished" +echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index fafafe59a..a7412ceee 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -10,6 +10,9 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo -hive -f foo +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo echo "Hive shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index be009cd45..37671cce8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -10,8 +10,11 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + echo "Creating observatory database" hive -e "drop database if exists ${TARGET} cascade" hive -e "create database if not exists ${TARGET}" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo hive -f foo diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 6c0dd92f0..b545ee637 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -60,9 +60,9 @@ create table TARGET.result stored as parquet as 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan - 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University - 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University +-- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 9976b8455..80a353cc9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + stats_db_name @@ -53,6 +53,10 @@ context_api_url the base url of the context api (https://services.openaire.eu/openaire) + + hadoop_user_name + user name of the wf owner + @@ -74,7 +78,40 @@ - + + + + + ${wf:conf('resumeFrom') eq 'Step1'} + ${wf:conf('resumeFrom') eq 'Step2'} + ${wf:conf('resumeFrom') eq 'Step3'} + ${wf:conf('resumeFrom') eq 'Step4'} + ${wf:conf('resumeFrom') eq 'Step5'} + ${wf:conf('resumeFrom') eq 'Step6'} + ${wf:conf('resumeFrom') eq 'Step7'} + ${wf:conf('resumeFrom') eq 'Step8'} + ${wf:conf('resumeFrom') eq 'Step9'} + ${wf:conf('resumeFrom') eq 'Step10'} + ${wf:conf('resumeFrom') eq 'Step11'} + ${wf:conf('resumeFrom') eq 'Step12'} + ${wf:conf('resumeFrom') eq 'Step13'} + ${wf:conf('resumeFrom') eq 'Step14'} + ${wf:conf('resumeFrom') eq 'Step15'} + ${wf:conf('resumeFrom') eq 'Step15_5'} + ${wf:conf('resumeFrom') eq 'Contexts'} + ${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'} + ${wf:conf('resumeFrom') eq 'Step16_1-definitions'} + ${wf:conf('resumeFrom') eq 'Step16_5'} + ${wf:conf('resumeFrom') eq 'Step19-finalize'} + ${wf:conf('resumeFrom') eq 'step20-createMonitorDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} + ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -302,22 +339,22 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${jobTracker} + ${nameNode} + finalizedb.sh + ${stats_db_name} + ${stats_db_shadow_name} + finalizedb.sh + - - - - - - - - - - - - - @@ -355,55 +392,57 @@ stats_db_name=${stats_db_name} observatory_db_name=${observatory_db_name} - + - - - - - - - - - - - - - + + + ${jobTracker} + ${nameNode} + observatory-post.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + observatory-post.sh + + + + - - - - - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + - - - - - - - - + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + - - - - - - - - - - - - - - - - + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + finalizeImpalaCluster.sh + + + + From 3ba11d64a18a936d3f82bf363498561091add89e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 7 Feb 2023 12:53:51 +0200 Subject: [PATCH 19/47] Changes 07022023 --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index b545ee637..02c798a8d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -62,7 +62,8 @@ create table TARGET.result stored as parquet as 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University --- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 80a353cc9..0ce429145 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -79,7 +79,6 @@ - ${wf:conf('resumeFrom') eq 'Step1'} From 35ba8bb32873147291c959da21422eddfa91f3f6 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 9 Feb 2023 12:57:57 +0200 Subject: [PATCH 20/47] Bug fixes --- .../dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 4a3010867..b5209b04d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -28,13 +28,13 @@ function copydb() { for i in `impala-shell -d ${db} --delimited -q "show tables"`; do impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - # run the same command twice because we may have failures in the first run (due to views pointing to the same db) for i in `impala-shell -d ${db} --delimited -q "show tables"`; do impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - # load the data from /tmp in the respective tables echo "copying data in tables and computing stats" From d71f5672d319f89e33e43693a5f109a5c5994a2f Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 9 Feb 2023 13:44:14 +0200 Subject: [PATCH 21/47] Add monitor post step --- .../oa/graph/stats/oozie_app/monitor-post.sh | 19 +++++++++++++++++++ .../dhp/oa/graph/stats/oozie_app/monitor.sh | 3 ++- .../graph/stats/oozie_app/observatory-post.sh | 13 +++++++------ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 15 ++++++++++++++- 4 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh new file mode 100644 index 000000000..5863625a1 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh @@ -0,0 +1,19 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index db0aa3485..440aac770 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -21,4 +21,5 @@ echo "Creating monitor database" #cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo hive $HIVE_OPTS -f foo -echo "Hive shell finished" \ No newline at end of file +echo "Hive shell finished" + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index a7412ceee..5863625a1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -7,12 +7,13 @@ then fi export SOURCE=$1 -export TARGET=$2 -export SHADOW=$3 - +export SHADOW=$2 export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" export HADOOP_USER_NAME="oozie" -hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo -hive $HIVE_OPTS -f foo -echo "Hive shell finished" +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 0ce429145..5d2e3799e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -366,6 +366,20 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh + + + + + + + ${jobTracker} + ${nameNode} + monitor-post.sh + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/step20-createMonitorDB.sql + monitor.sh + @@ -400,7 +414,6 @@ ${jobTracker} ${nameNode} observatory-post.sh - ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} observatory-post.sh From 7b78b15c81eaa19f57132831a8d027ee78790eac Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 13 Feb 2023 09:27:00 +0200 Subject: [PATCH 22/47] Changes for copying to Impala Cluster --- .../dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 1 + .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index b5209b04d..a2424a351 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -24,6 +24,7 @@ function copydb() { impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + impala-shell -q "INVALIDATE METADATA" echo "creating schema for ${db}" for i in `impala-shell -d ${db} --delimited -q "show tables"`; do diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 5d2e3799e..558b9544f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -103,6 +103,7 @@ ${wf:conf('resumeFrom') eq 'Step16_5'} ${wf:conf('resumeFrom') eq 'Step19-finalize'} ${wf:conf('resumeFrom') eq 'step20-createMonitorDB'} + ${wf:conf('resumeFrom') eq 'step20-createMonitorDB-post'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} @@ -377,8 +378,7 @@ monitor-post.sh ${monitor_db_name} ${monitor_db_shadow_name} - ${wf:appPath()}/scripts/step20-createMonitorDB.sql - monitor.sh + monitor-post.sh From 935db0ab256bfb70c66b38b97c925cbc80443a7f Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 13 Feb 2023 09:29:09 +0200 Subject: [PATCH 23/47] Added organizations for Monitor --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 02c798a8d..0f4bb1330 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -63,7 +63,10 @@ create table TARGET.result stored as parquet as 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech - 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From 3400133c2f21b6809dc961c4cf411b6e06680795 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 13 Feb 2023 09:44:00 +0200 Subject: [PATCH 24/47] Bug fix --- .../oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 8 ++++---- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index a2424a351..5b6752398 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -7,7 +7,7 @@ then fi #export HADOOP_USER_NAME="dimitris.pierrakos" -export HADOOP_USER_NAME=$4 +export HADOOP_USER_NAME=$5 function copydb() { db=$1 @@ -52,10 +52,10 @@ function copydb() { STATS_DB=$1 MONITOR_DB=$2 OBSERVATORY_DB=$3 -HADOOP_USER_NAME=$4 -#EXT_DB=$4 +EXT_DB=$4 +HADOOP_USER_NAME=$5 -#copydb $EXT_DB +copydb $EXT_DB copydb $STATS_DB copydb $MONITOR_DB copydb $OBSERVATORY_DB diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 558b9544f..31a7a17c5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -432,6 +432,7 @@ ${stats_db_name} ${monitor_db_name} ${observatory_db_name} + ${stats_ext} ${hadoop_user_name} copyDataToImpalaCluster.sh From f3aaff36884d68746f0d2b6a05370468c74aa4c0 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 14 Feb 2023 09:48:36 +0200 Subject: [PATCH 25/47] Remove duplicate orgs --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 0f4bb1330..628af519a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -63,9 +63,6 @@ create table TARGET.result stored as parquet as 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech - 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University - 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town ) )) foo; From 595192d5105adf54e7034b86ec56dfd2d8c87d4c Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 14 Feb 2023 16:24:08 +0200 Subject: [PATCH 26/47] Bug fix --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 31a7a17c5..e23bd0aa3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -10,7 +10,6 @@ external_stats_db_name - stats_ext the external stats that should be added since they are not included in the graph database @@ -432,7 +431,7 @@ ${stats_db_name} ${monitor_db_name} ${observatory_db_name} - ${stats_ext} + ${external_stats_db_name} ${hadoop_user_name} copyDataToImpalaCluster.sh From 032a401cbf930304affe67eb533bb4402ba8eebe Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 20 Feb 2023 09:29:20 +0200 Subject: [PATCH 27/47] Bug fixes --- .../oozie_app/copyDataToImpalaCluster.sh | 2 +- .../oa/graph/stats/oozie_app/indicators.sh | 6 +++-- .../stats/oozie_app/scripts/step15_5.sql | 16 +++++------ .../scripts/step16-createIndicatorsTables.sql | 16 +++++------ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 27 ++++++++++--------- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 5b6752398..843877c90 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -14,7 +14,7 @@ function copydb() { # copy the databases from ocean to impala - #echo "copying $db" + echo "copying $db" hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp # change ownership to impala diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 473864315..2f1eefa0c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -7,7 +7,9 @@ then fi export TARGET=$1 -export SCRIPT_PATH=$2 +export STATS_EXT=$2 +export SCRIPT_PATH=$3 + export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false" export HADOOP_USER_NAME="oozie" @@ -15,7 +17,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive $HIVE_OPTS -f foo hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 1ae856355..61c0726ff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -29,17 +29,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; -create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; -create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; -create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; -create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; +create view ${stats_db_name}.rndexpenditure as select * from ${external_stats_db_name}.rndexpediture; +create view ${stats_db_name}.rndgdpexpenditure as select * from ${external_stats_db_name}.rndgdpexpenditure; +create view ${stats_db_name}.doctoratestudents as select * from ${external_stats_db_name}.doctoratestudents; +create view ${stats_db_name}.totalresearchers as select * from ${external_stats_db_name}.totalresearchers; +create view ${stats_db_name}.totalresearchersft as select * from ${external_stats_db_name}.totalresearchersft; +create view ${stats_db_name}.hrrst as select * from ${external_stats_db_name}.hrrst; create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( - select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, + select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; @@ -52,4 +52,4 @@ from ( join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index ac4d4202a..4fd941e5d 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -180,7 +180,7 @@ from publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; @@ -192,7 +192,7 @@ from publication pd left outer join ( select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; @@ -220,11 +220,11 @@ ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_print != "" UNION ALL SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), issn AS ( SELECT * @@ -291,7 +291,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as journal_is_oa, issn_1 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -301,7 +301,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as journal_is_oa, issn_2 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -343,7 +343,7 @@ create table if not exists indi_pub_hybrid stored as parquet as issn_1 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -354,7 +354,7 @@ create table if not exists indi_pub_hybrid stored as parquet as issn_2 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index e23bd0aa3..e9453d7b1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -108,6 +108,7 @@ ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step24-updateCache'} @@ -289,6 +290,7 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} @@ -313,6 +315,7 @@ ${nameNode} indicators.sh ${stats_db_name} + ${external_stats_db_name} ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh @@ -452,21 +455,21 @@ ${observatory_db_shadow_name} finalizeImpalaCluster.sh + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + - - - - - - - - - - - - From d2f9ccf9347a9828588624d109bb453795b526e5 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 20 Feb 2023 10:41:21 +0200 Subject: [PATCH 28/47] Changes to separate monitor wf --- .../oozie_app/copyDataToImpalaCluster.sh | 62 +++++++++++++++++++ .../oozie_app/finalizeImpalaCluster.sh | 28 +++++++++ .../oa/graph/monitor/oozie_app/updateCache.sh | 4 ++ 3 files changed, 94 insertions(+) create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..843877c90 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,62 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +#export HADOOP_USER_NAME="dimitris.pierrakos" +export HADOOP_USER_NAME=$5 + +function copydb() { + db=$1 + + # copy the databases from ocean to impala + + echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp + + # change ownership to impala + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + impala-shell -q "INVALIDATE METADATA" + echo "creating schema for ${db}" + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # run the same command twice because we may have failures in the first run (due to views pointing to the same db) + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db +} + +STATS_DB=$1 +MONITOR_DB=$2 +OBSERVATORY_DB=$3 +EXT_DB=$4 +HADOOP_USER_NAME=$5 + +copydb $EXT_DB +copydb $STATS_DB +copydb $MONITOR_DB +copydb $OBSERVATORY_DB + diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..fedfa00af --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createShadowDB() { + SOURCE=$1 + SHADOW=$2 + + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; +# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +} + +STATS_DB=$1 +STATS_DB_SHADOW=$2 +MONITOR_DB=$3 +MONITOR_DB_SHADOW=$4 +OBSERVATORY_DB=$5 +OBSERVATORY_DB_SHADOW=$6 + +createShadowDB $STATS_DB $STATS_DB_SHADOW +createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW +createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh new file mode 100644 index 000000000..03aa535e1 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +curl --request GET $1/cache/updateCache +sleep 6h \ No newline at end of file From 90807b60c7f71a944dde611862cdabc4266c1646 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 20 Feb 2023 10:42:24 +0200 Subject: [PATCH 29/47] Changes to monitor wf --- .../oozie_app/copyDataToImpalaCluster.sh | 12 ++---- .../oozie_app/finalizeImpalaCluster.sh | 6 --- .../graph/monitor/oozie_app/monitor-post.sh | 20 ++++----- .../dhp/oa/graph/monitor/oozie_app/monitor.sh | 3 +- .../oa/graph/monitor/oozie_app/workflow.xml | 42 ++++++++++++++++++- 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh index 843877c90..c05d8342a 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh @@ -7,7 +7,7 @@ then fi #export HADOOP_USER_NAME="dimitris.pierrakos" -export HADOOP_USER_NAME=$5 +export HADOOP_USER_NAME=$2 function copydb() { db=$1 @@ -49,14 +49,8 @@ function copydb() { hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db } -STATS_DB=$1 -MONITOR_DB=$2 -OBSERVATORY_DB=$3 -EXT_DB=$4 -HADOOP_USER_NAME=$5 +MONITOR_DB=$1 +HADOOP_USER_NAME=$2 -copydb $EXT_DB -copydb $STATS_DB copydb $MONITOR_DB -copydb $OBSERVATORY_DB diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh index fedfa00af..46eaba6d0 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh @@ -16,13 +16,7 @@ function createShadowDB() { impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - } -STATS_DB=$1 -STATS_DB_SHADOW=$2 MONITOR_DB=$3 MONITOR_DB_SHADOW=$4 -OBSERVATORY_DB=$5 -OBSERVATORY_DB_SHADOW=$6 -createShadowDB $STATS_DB $STATS_DB_SHADOW createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW -createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh index dd82310e0..5863625a1 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh @@ -7,15 +7,13 @@ then fi export SOURCE=$1 -export TARGET=$2 -export SHADOW=$3 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" -impala-shell -q "invalidate metadata;" -#impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - -#echo "Impala shell finished" -# -#echo "Updating shadow monitor database" -#impala-shell -q "create database if not exists ${SHADOW}" -#impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -#impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -#echo "Shadow db ready!" \ No newline at end of file +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh index f39bf4893..5e0f68586 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -8,8 +8,7 @@ fi export SOURCE=$1 export TARGET=$2 -export SHADOW=$3 -export SCRIPT_PATH=$4 +export SCRIPT_PATH=$3 export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" export HADOOP_USER_NAME="oozie" diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml index ab51931b6..f24dcc700 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -66,7 +66,6 @@ monitor.sh ${stats_db_name} ${monitor_db_name} - ${monitor_db_shadow_name} ${wf:appPath()}/scripts/createMonitorDB.sql monitor.sh @@ -85,9 +84,48 @@ ${monitor_db_shadow_name} monitor-post.sh - + + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + + + ${monitor_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${monitor_db_name} + ${monitor_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + + + + \ No newline at end of file From 43b23a9bf30d357148ac414f1898d9e8c10c1189 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 15 Mar 2023 09:57:12 +0200 Subject: [PATCH 30/47] Update step20-createMonitorDB.sql Added Technological University Dublin --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 237f68fae..507eb3f47 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -63,7 +63,8 @@ create table TARGET.result stored as parquet as 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech - 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town + 'openorgs____::d11f981828c485cd23d93f7f24f24db1' -- Technological University Dublin ))) foo; compute stats TARGET.result; From fad7fa4af8510b040f5e3282c6f5ba86550580c1 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 22 Mar 2023 09:44:00 +0200 Subject: [PATCH 31/47] Added Technological University Dublin --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 628af519a..e0ac09387 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -63,7 +63,8 @@ create table TARGET.result stored as parquet as 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech - 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town + 'openorgs____::d11f981828c485cd23d93f7f24f24db1' -- Technological University Dublin ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From 9e1335df4c34230a056ddced875447e29393460a Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 4 Apr 2023 13:22:40 +0300 Subject: [PATCH 32/47] -Added Technological University Dublin -Added project_organization_contribution table --- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 1 + .../dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 507eb3f47..6ea95da7b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -165,6 +165,7 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; compute stats TARGET.project_results; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index c31180c14..e0522e149 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -84,4 +84,12 @@ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; \ No newline at end of file +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; + +CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS +SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +properties[0].value contribution, properties[1].value currency +from ${openaire_db_name}.relation r +LATERAL VIEW explode (r.properties) properties +where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file From 91e18ac7f4f95fdb0a6eef04d99820f1ada542a9 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 6 Apr 2023 10:53:11 +0300 Subject: [PATCH 33/47] Added project_organization_contribution table --- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 1 + .../dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index e0ac09387..0f5fc71d4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -167,6 +167,7 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index c31180c14..e0522e149 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -84,4 +84,12 @@ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; \ No newline at end of file +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; + +CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS +SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +properties[0].value contribution, properties[1].value currency +from ${openaire_db_name}.relation r +LATERAL VIEW explode (r.properties) properties +where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file From 9b41dff33c3f9aa4df0ff32bc1066990ec43d4da Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 7 Apr 2023 09:21:38 +0300 Subject: [PATCH 34/47] Update step20-createMonitorDB.sql Added Delft University of Technology --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 6ea95da7b..e0bd380c4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -62,9 +62,10 @@ create table TARGET.result stored as parquet as 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University - 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town - 'openorgs____::d11f981828c485cd23d93f7f24f24db1' -- Technological University Dublin + 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin + 'openorgs____::5e6bf8962665cdd040341171e5c631d8' -- Delft University of Technology ))) foo; compute stats TARGET.result; From c85de8fa1f4b8a1c3a7aaa3d76fa916527a995d2 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 7 Apr 2023 09:22:59 +0300 Subject: [PATCH 35/47] -Added Technological University Dublin -Added project_organization_contribution table -Add Delft University of Technology --- .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 0f5fc71d4..68e4a2f09 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -65,6 +65,7 @@ create table TARGET.result stored as parquet as 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town 'openorgs____::d11f981828c485cd23d93f7f24f24db1' -- Technological University Dublin + 'openorgs____::5e6bf8962665cdd040341171e5c631d8' -- Delft University of Technology ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From 5247cb71151b34cb6ee13dc04124788996b649c7 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 19 Apr 2023 11:11:19 +0300 Subject: [PATCH 36/47] Bug fix --- .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index a352094df..d73f329e6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -64,7 +64,7 @@ create table TARGET.result stored as parquet as 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town - 'openorgs____::d11f981828c485cd23d93f7f24f24db1' -- Technological University Dublin + 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin 'openorgs____::5e6bf8962665cdd040341171e5c631d8' -- Delft University of Technology ) )) foo; From 4fa750b719f1e443dcb5d32058bfcdb62b53856e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 19 Apr 2023 17:39:53 +0300 Subject: [PATCH 37/47] Bug fixes on monitor-update --- .../oozie_app/scripts/createMonitorDB.sql | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index ccbe8cab8..aed2126af 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -1,6 +1,5 @@ DROP TABLE IF EXISTS TARGET.result_new; -<<<<<<< HEAD create table TARGET.result_new stored as parquet as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( @@ -77,16 +76,6 @@ ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; -======= -create table TARGET.result_new as - select distinct * from ( - select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( --- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork --- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University - 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech - ) )) foo; - -COMPUTE STATS TARGET.result_new; INSERT INTO TARGET.result select * from TARGET.result_new; COMPUTE STATS TARGET.result; @@ -155,26 +144,21 @@ INSERT INTO TARGET.result_topics select * from TARGET.result_topics orig where e COMPUTE STATS TARGET.result_topics; INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_fos; ->>>>>>> beta create view TARGET.foo1 as select * from TARGET.result_result rr where rr.source in (select id from TARGET.result_new); create view TARGET.foo2 as select * from TARGET.result_result rr where rr.target in (select id from TARGET.result_new); INSERT INTO TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -<<<<<<< HEAD + ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -======= -COMPUTE STATS TARGET.result_result; ->>>>>>> beta -- indicators -- Sprint 1 ---- INSERT INTO TARGET.indi_pub_green_oa select * from TARGET.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -<<<<<<< HEAD ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; + INSERT INTO TARGET.indi_pub_grey_lit select * from TARGET.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; INSERT INTO TARGET.indi_pub_doi_from_crossref select * from TARGET.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); @@ -275,7 +259,7 @@ INSERT INTO TARGET.indi_pub_in_subscribed select * from TARGET.indi_pub_in_subsc COMPUTE STATS TARGET.indi_pub_in_subscribed; INSERT INTO TARGET.indi_result_with_pid select * from TARGET.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); COMPUTE STATS TARGET.indi_result_with_pid; ->>>>>>> beta + --create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_datasets_gold_oa; --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); From 53ce0230353177e541c2d534e96d07218ae968c5 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Sun, 23 Apr 2023 18:22:26 +0300 Subject: [PATCH 38/47] Bug fixes --- .../eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh | 1 - .../dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql | 1 - .../eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml | 2 -- 3 files changed, 4 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh index 5e0f68586..56681dc62 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -17,7 +17,6 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating monitor database" -#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo hive $HIVE_OPTS -f foo echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index aed2126af..257cb1419 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -3,7 +3,6 @@ DROP TABLE IF EXISTS TARGET.result_new; create table TARGET.result_new stored as parquet as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( --- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University ) )) foo; diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml index f24dcc700..651b6fa13 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -93,8 +93,6 @@ ${jobTracker} ${nameNode} copyDataToImpalaCluster.sh - - ${monitor_db_name} ${hadoop_user_name} copyDataToImpalaCluster.sh From fdb5d2b39fc47de13c36c4c8bac9b8334e9ee73c Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Sun, 23 Apr 2023 18:29:00 +0300 Subject: [PATCH 39/47] Bug fixes --- dhp-workflows/dhp-monitor-update/pom.xml | 32 --- .../monitor/oozie_app/config-default.xml | 34 --- .../oozie_app/copyDataToImpalaCluster.sh | 56 ---- .../oozie_app/finalizeImpalaCluster.sh | 22 -- .../graph/monitor/oozie_app/monitor-post.sh | 19 -- .../dhp/oa/graph/monitor/oozie_app/monitor.sh | 22 -- .../oozie_app/scripts/createMonitorDB.sql | 266 ------------------ .../oa/graph/monitor/oozie_app/updateCache.sh | 4 - .../oa/graph/monitor/oozie_app/workflow.xml | 129 --------- 9 files changed, 584 deletions(-) delete mode 100644 dhp-workflows/dhp-monitor-update/pom.xml delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-monitor-update/pom.xml b/dhp-workflows/dhp-monitor-update/pom.xml deleted file mode 100644 index ca0bb9837..000000000 --- a/dhp-workflows/dhp-monitor-update/pom.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - dhp-workflows - eu.dnetlib.dhp - 1.2.4-SNAPSHOT - - 4.0.0 - dhp-monitor-update - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - - - - pl.project13.maven - git-commit-id-plugin - 2.1.11 - - false - - - - - diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml deleted file mode 100644 index 63fc84d75..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hive_jdbc_url - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - stats_tool_api_url - ${stats_tool_api_url} - - \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh deleted file mode 100644 index c05d8342a..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/copyDataToImpalaCluster.sh +++ /dev/null @@ -1,56 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -#export HADOOP_USER_NAME="dimitris.pierrakos" -export HADOOP_USER_NAME=$2 - -function copydb() { - db=$1 - - # copy the databases from ocean to impala - - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp - - # change ownership to impala - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db - - # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; - - impala-shell -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - - - # run the same command twice because we may have failures in the first run (due to views pointing to the same db) - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - - - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done - - # deleting the remaining directory from hdfs - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db -} - -MONITOR_DB=$1 -HADOOP_USER_NAME=$2 - -copydb $MONITOR_DB - diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh deleted file mode 100644 index 46eaba6d0..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/finalizeImpalaCluster.sh +++ /dev/null @@ -1,22 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -function createShadowDB() { - SOURCE=$1 - SHADOW=$2 - - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE"; - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; -# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - - impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - -} - -MONITOR_DB=$3 -MONITOR_DB_SHADOW=$4 - -createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh deleted file mode 100644 index 5863625a1..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh +++ /dev/null @@ -1,19 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -export SOURCE=$1 -export SHADOW=$2 -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" -export HADOOP_USER_NAME="oozie" - -echo "Updating shadow database" -hive -e "drop database if exists ${SHADOW} cascade" -hive -e "create database if not exists ${SHADOW}" -hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo -hive -f foo -echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh deleted file mode 100644 index 56681dc62..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh +++ /dev/null @@ -1,22 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -export SOURCE=$1 -export TARGET=$2 -export SCRIPT_PATH=$3 - -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" -export HADOOP_USER_NAME="oozie" - -echo "Getting file from " $SCRIPT_PATH -hdfs dfs -copyToLocal $SCRIPT_PATH - -echo "Creating monitor database" -cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo -hive $HIVE_OPTS -f foo -echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql deleted file mode 100644 index 257cb1419..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ /dev/null @@ -1,266 +0,0 @@ -DROP TABLE IF EXISTS TARGET.result_new; - -create table TARGET.result_new stored as parquet as - select distinct * from ( - select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( - 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University - ) )) foo; - -ANALYZE TABLE TARGET.result_new COMPUTE STATISTICS; - -INSERT INTO TARGET.result select * from TARGET.result_new; - -INSERT INTO TARGET.result_citations select * from TARGET.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; - -INSERT INTO TARGET.result_references_oc select * from TARGET.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; - -INSERT INTO TARGET.result_citations_oc select * from TARGET.result_citations_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; - -INSERT INTO TARGET.result_classifications select * from TARGET.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; - -INSERT INTO TARGET.result_apc select * from TARGET.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; - -INSERT INTO TARGET.result_concepts select * from TARGET.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; - -INSERT INTO TARGET.result_datasources select * from TARGET.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; - -INSERT INTO TARGET.result_fundercount select * from TARGET.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; - -INSERT INTO TARGET.result_gold select * from TARGET.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; - -INSERT INTO TARGET.result_greenoa select * from TARGET.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; - -INSERT INTO TARGET.result_languages select * from TARGET.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; - -INSERT INTO TARGET.result_licenses select * from TARGET.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; - -INSERT INTO TARGET.result_oids select * from TARGET.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; - -INSERT INTO TARGET.result_organization select * from TARGET.result_organization orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; - -INSERT INTO TARGET.result_peerreviewed select * from TARGET.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; - -INSERT INTO TARGET.result_pids select * from TARGET.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; - -INSERT INTO TARGET.result_projectcount select * from TARGET.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; - -INSERT INTO TARGET.result_projects select * from TARGET.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; - -INSERT INTO TARGET.result_refereed select * from TARGET.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; - -INSERT INTO TARGET.result_sources select * from TARGET.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; - -INSERT INTO TARGET.result_topics select * from TARGET.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; - -INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; - -INSERT INTO TARGET.result select * from TARGET.result_new; -COMPUTE STATS TARGET.result; - -INSERT INTO TARGET.result_citations select * from TARGET.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_citations; - -INSERT INTO TARGET.result_references_oc select * from TARGET.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_references_oc; - -INSERT INTO TARGET.result_citations_oc select * from TARGET.result_citations_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_citations_oc; - -INSERT INTO TARGET.result_classifications select * from TARGET.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_classifications; - -INSERT INTO TARGET.result_apc select * from TARGET.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_apc; - -INSERT INTO TARGET.result_concepts select * from TARGET.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_concepts; - -INSERT INTO TARGET.result_datasources select * from TARGET.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_datasources; - -INSERT INTO TARGET.result_fundercount select * from TARGET.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_fundercount; - -INSERT INTO TARGET.result_gold select * from TARGET.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_gold; - -INSERT INTO TARGET.result_greenoa select * from TARGET.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_greenoa; - -INSERT INTO TARGET.result_languages select * from TARGET.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_languages; - -INSERT INTO TARGET.result_licenses select * from TARGET.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_licenses; - -INSERT INTO TARGET.result_oids select * from TARGET.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_oids; - -INSERT INTO TARGET.result_organization select * from TARGET.result_organization orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_organization; - -INSERT INTO TARGET.result_peerreviewed select * from TARGET.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_peerreviewed; - -INSERT INTO TARGET.result_pids select * from TARGET.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_pids; - -INSERT INTO TARGET.result_projectcount select * from TARGET.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_projectcount; - -INSERT INTO TARGET.result_projects select * from TARGET.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_projects; - -INSERT INTO TARGET.result_refereed select * from TARGET.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_refereed; - -INSERT INTO TARGET.result_sources select * from TARGET.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_sources; - -INSERT INTO TARGET.result_topics select * from TARGET.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.result_topics; - -INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); - -create view TARGET.foo1 as select * from TARGET.result_result rr where rr.source in (select id from TARGET.result_new); -create view TARGET.foo2 as select * from TARGET.result_result rr where rr.target in (select id from TARGET.result_new); -INSERT INTO TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; -drop view TARGET.foo1; -drop view TARGET.foo2; - -ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; - - --- indicators --- Sprint 1 ---- -INSERT INTO TARGET.indi_pub_green_oa select * from TARGET.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; - -INSERT INTO TARGET.indi_pub_grey_lit select * from TARGET.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_doi_from_crossref select * from TARGET.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; --- Sprint 2 ---- -INSERT INTO TARGET.indi_result_has_cc_licence select * from TARGET.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; -INSERT INTO TARGET.indi_result_has_cc_licence_url select * from TARGET.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_has_abstract select * from TARGET.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; -INSERT INTO TARGET.indi_result_with_orcid select * from TARGET.indi_result_with_orcid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ----- Sprint 3 ---- -INSERT INTO TARGET.indi_funded_result_with_fundref select * from TARGET.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; ----- Sprint 4 ---- -INSERT INTO TARGET.indi_pub_diamond select * from TARGET.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_in_transformative select * from TARGET.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_closed_other_open select * from TARGET.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ----- Sprint 5 ---- -INSERT INTO TARGET.indi_result_no_of_copies select * from TARGET.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ----- Sprint 6 ---- -INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from TARGET.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_downloads select * from TARGET.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_downloads_datasource select * from TARGET.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_downloads_year select * from TARGET.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from TARGET.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ----- Sprint 7 ---- -INSERT INTO TARGET.indi_pub_gold_oa select * from TARGET.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_hybrid select * from TARGET.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; - -INSERT INTO TARGET.indi_pub_has_preprint select * from TARGET.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; -INSERT INTO TARGET.indi_pub_in_subscribed select * from TARGET.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; -INSERT INTO TARGET.indi_result_with_pid select * from TARGET.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; -======= -COMPUTE STATS TARGET.indi_pub_green_oa; -INSERT INTO TARGET.indi_pub_grey_lit select * from TARGET.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_grey_lit; -INSERT INTO TARGET.indi_pub_doi_from_crossref select * from TARGET.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_doi_from_crossref; --- Sprint 2 ---- -INSERT INTO TARGET.indi_result_has_cc_licence select * from TARGET.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_result_has_cc_licence; -INSERT INTO TARGET.indi_result_has_cc_licence_url select * from TARGET.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_result_has_cc_licence_url; -INSERT INTO TARGET.indi_pub_has_abstract select * from TARGET.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_has_abstract; -INSERT INTO TARGET.indi_result_with_orcid select * from TARGET.indi_result_with_orcid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_result_with_orcid; ----- Sprint 3 ---- -INSERT INTO TARGET.indi_funded_result_with_fundref select * from TARGET.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_funded_result_with_fundref; ----- Sprint 4 ---- -INSERT INTO TARGET.indi_pub_diamond select * from TARGET.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_diamond; -INSERT INTO TARGET.indi_pub_in_transformative select * from TARGET.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_in_transformative; -INSERT INTO TARGET.indi_pub_closed_other_open select * from TARGET.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_closed_other_open; ----- Sprint 5 ---- -INSERT INTO TARGET.indi_result_no_of_copies select * from TARGET.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_result_no_of_copies; ----- Sprint 6 ---- -INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from TARGET.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_hybrid_oa_with_cc; -INSERT INTO TARGET.indi_pub_downloads select * from TARGET.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -COMPUTE STATS TARGET.indi_pub_downloads; -INSERT INTO TARGET.indi_pub_downloads_datasource select * from TARGET.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -COMPUTE STATS TARGET.indi_pub_downloads_datasource; -INSERT INTO TARGET.indi_pub_downloads_year select * from TARGET.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -COMPUTE STATS TARGET.indi_pub_downloads_year; -INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from TARGET.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); -COMPUTE STATS TARGET.indi_pub_downloads_datasource_year; ----- Sprint 7 ---- -INSERT INTO TARGET.indi_pub_gold_oa select * from TARGET.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_gold_oa; -INSERT INTO TARGET.indi_pub_hybrid select * from TARGET.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_hybrid; - -INSERT INTO TARGET.indi_pub_has_preprint select * from TARGET.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_has_preprint; -INSERT INTO TARGET.indi_pub_in_subscribed select * from TARGET.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_pub_in_subscribed; -INSERT INTO TARGET.indi_result_with_pid select * from TARGET.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); -COMPUTE STATS TARGET.indi_result_with_pid; - ---create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_datasets_gold_oa; ---create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_software_gold_oa; -DROP TABLE TARGET.result_new; diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh deleted file mode 100644 index 03aa535e1..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -curl --request GET $1/cache/updateCache -sleep 6h \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml deleted file mode 100644 index 651b6fa13..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml +++ /dev/null @@ -1,129 +0,0 @@ - - - - stats_db_name - the target stats database name - - - stats_db_shadow_name - the name of the shadow schema - - - monitor_db_name - the target monitor db name - - - monitor_db_shadow_name - the name of the shadow monitor db - - - stats_tool_api_url - The url of the API of the stats tool. Is used to trigger the cache update. - - - hive_metastore_uris - hive server metastore URIs - - - hive_jdbc_url - hive server jdbc url - - - hive_timeout - the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. - - - context_api_url - the base url of the context api (https://services.openaire.eu/openaire) - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hive_metastore_uris} - - - hive.txn.timeout - ${hive_timeout} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${jobTracker} - ${nameNode} - monitor.sh - ${stats_db_name} - ${monitor_db_name} - ${wf:appPath()}/scripts/createMonitorDB.sql - monitor.sh - - - - - - - - - ${jobTracker} - ${nameNode} - monitor-post.sh - ${stats_db_name} - ${monitor_db_name} - ${monitor_db_shadow_name} - monitor-post.sh - - - - - - - - ${jobTracker} - ${nameNode} - copyDataToImpalaCluster.sh - ${monitor_db_name} - ${hadoop_user_name} - copyDataToImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - finalizeImpalaCluster.sh - ${monitor_db_name} - ${monitor_db_shadow_name} - finalizeImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - updateCache.sh - ${stats_tool_api_url} - updateCache.sh - - - - - - \ No newline at end of file From e57ecdaf98a6b55f0071271248bbfa9240a57565 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Sun, 30 Apr 2023 17:52:23 +0300 Subject: [PATCH 40/47] Update step20-createMonitorDB.sql Add University of Manitoba --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index d73f329e6..bc72b6c15 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -65,7 +65,8 @@ create table TARGET.result stored as parquet as 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin - 'openorgs____::5e6bf8962665cdd040341171e5c631d8' -- Delft University of Technology + 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology + 'openorgs____::846cb428d3f52a445f7275561a7beb5d' -- University of Manitoba ) )) foo; ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From c3d58e58e1a59070ce710f03f4c4b568debef381 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 2 May 2023 11:54:07 +0300 Subject: [PATCH 41/47] Bug fixes --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index e152eb1ee..a436d0380 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -31,8 +31,8 @@ hdfs dfs -copyFromLocal categories.csv ${TMP} hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} -export HADOOP_USER="dimitris.pierrakos" -export HADOOP_USER_NAME="dimitris.pierrakos" +export HADOOP_USER="oozie" +export HADOOP_USER_NAME="oozie" echo "Creating and populating impala tables" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" From 99ac5bab463af7a04328ba28c9d4d403fe686efd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 4 May 2023 19:38:39 +0200 Subject: [PATCH 42/47] added check to avoid NPE when checking the organization country --- .../eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java index a69b1a8bf..85e446121 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java @@ -78,6 +78,8 @@ public class GetDatasourceFromCountry implements Serializable { Encoders.bean(Organization.class)) .filter( (FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() && + o.getCountry() != null && + o.getCountry().getClassid() != null && o.getCountry().getClassid().length() > 0 && o.getCountry().getClassid().equals(country)); From 8c05f496659c48558973cdc25bf1b8a58813b64d Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 9 May 2023 10:48:34 +0200 Subject: [PATCH 43/47] moved the version as it was before the change --- .../PrepareResultCommunitySet.java | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 19b985964..0fc8cb390 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -72,33 +72,28 @@ public class PrepareResultCommunitySet { String outputPath, OrganizationMap organizationMap) { - Dataset relationAffiliation = readPath(spark, inputPath, Relation.class) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equalsIgnoreCase(ModelConstants.HAS_AUTHOR_INSTITUTION)); + Dataset relation = readPath(spark, inputPath, Relation.class); + relation.createOrReplaceTempView("relation"); - Dataset relationOrganization = readPath(spark, inputPath, Relation.class) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equalsIgnoreCase(ModelConstants.MERGES)); + String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges " + + "FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND lower(relClass) = '" + + ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase() + + "') result_organization " + + "LEFT JOIN (SELECT source, collect_set(target) org_set " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND lower(relClass) = '" + + ModelConstants.MERGES.toLowerCase() + + "' " + + " GROUP BY source) organization_organization " + + "ON result_organization.target = organization_organization.source "; - Dataset result_organizationset = relationAffiliation - .joinWith( - relationOrganization, - relationAffiliation.col("target").equalTo(relationOrganization.col("source")), - "left") - .groupByKey((MapFunction, String>) t2 -> t2._2().getSource(), Encoders.STRING()) - .mapGroups((MapGroupsFunction, ResultOrganizations>) (k, it) -> { - ResultOrganizations rOrgs = new ResultOrganizations(); - rOrgs.setOrgId(k); - Tuple2 first = it.next(); - rOrgs.setResultId(first._1().getSource()); - ArrayList merges = new ArrayList<>(); - merges.add(first._2().getTarget()); - it.forEachRemaining(t -> merges.add(t._2().getTarget())); - rOrgs.setMerges(merges); - return rOrgs; - }, Encoders.bean(ResultOrganizations.class)); + Dataset result_organizationset = spark + .sql(query) + .as(Encoders.bean(ResultOrganizations.class)); result_organizationset .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class)) From 00d0d162b6d1e5c96f63aa286845a552a5820dcc Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 12 May 2023 12:31:13 +0300 Subject: [PATCH 44/47] Update copyDataToImpalaCluster.sh Added a temporary folder to copy the files to avoid permission issues --- .../graph/stats/oozie_app/copyDataToImpalaCluster.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 843877c90..a99a78965 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -11,14 +11,15 @@ export HADOOP_USER_NAME=$5 function copydb() { db=$1 - + FILE=("hive_wf_tmp_"$RANDOM) + hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ # copy the databases from ocean to impala echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ # change ownership to impala - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db # create the databases impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; @@ -41,12 +42,12 @@ function copydb() { echo "copying data in tables and computing stats" for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; done # deleting the remaining directory from hdfs - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${db}.db +hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db } STATS_DB=$1 From 86fe886c1ac305da1af29e2ea464a1166c0f3eb8 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 15 May 2023 11:20:51 +0200 Subject: [PATCH 45/47] removed the inverse of the Citing relation --- .../opencitations/CreateActionSetSparkJob.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java index 61bc3fbca..4c658e52f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -114,10 +114,10 @@ public class CreateActionSetSparkJob implements Serializable { if (!citing.equals(cited)) { relationList - .addAll( - getRelations( + .add( + getRelation( citing, - cited)); + cited, ModelConstants.CITES)); if (duplicate && value.getCiting().endsWith(".refs")) { citing = ID_PREFIX + IdentifierFactory @@ -125,7 +125,7 @@ public class CreateActionSetSparkJob implements Serializable { CleaningFunctions .normalizePidValue( "doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs")))); - relationList.addAll(getRelations(citing, cited)); + relationList.add(getRelation(citing, cited, ModelConstants.CITES)); } } From 78b07400c0ba6685433c4b70bb32cb6c18345507 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 15 May 2023 11:37:08 +0200 Subject: [PATCH 46/47] changed test classes --- .../CreateOpenCitationsASTest.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java index 3e4ce750e..523437950 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java @@ -99,7 +99,7 @@ public class CreateOpenCitationsASTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); - assertEquals(62, tmp.count()); + assertEquals(31, tmp.count()); // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); @@ -131,7 +131,7 @@ public class CreateOpenCitationsASTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); - assertEquals(46, tmp.count()); + assertEquals(23, tmp.count()); // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); @@ -241,7 +241,7 @@ public class CreateOpenCitationsASTest { assertEquals("resultResult", r.getRelType()); }); assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count()); - assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count()); + assertEquals(0, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count()); } @@ -318,15 +318,15 @@ public class CreateOpenCitationsASTest { JavaRDD check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1)); - assertEquals(10, check.count()); + assertEquals(5, check.count()); - check.foreach(r -> { - if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) || - r.getSource().equals(doi5) || r.getSource().equals(doi6)) { - assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass()); - assertEquals(doi1, r.getTarget()); - } - }); +// check.foreach(r -> { +// if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) || +// r.getSource().equals(doi5) || r.getSource().equals(doi6)) { +// assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass()); +// assertEquals(doi1, r.getTarget()); +// } +// }); assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count()); check.filter(r -> r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass())); From b3f9633205ae5bbb7dcb1917dfb68dbc54ed71d7 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 15 May 2023 12:51:44 +0300 Subject: [PATCH 47/47] Update copyDataToImpalaCluster.sh Added option --user to impala-shell command --- .../oozie_app/copyDataToImpalaCluster.sh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index a99a78965..4ff236d07 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -22,28 +22,28 @@ function copydb() { hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; - impala-shell -q "INVALIDATE METADATA" + impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA" echo "creating schema for ${db}" - for i in `impala-shell -d ${db} --delimited -q "show tables"`; + for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - # run the same command twice because we may have failures in the first run (due to views pointing to the same db) - for i in `impala-shell -d ${db} --delimited -q "show tables"`; + for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - + impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - # load the data from /tmp in the respective tables echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; + impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; done # deleting the remaining directory from hdfs