From 036ba03fcdf3be0d4b574d79c6e430dbebc4b958 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 26 Mar 2024 13:29:04 +0200 Subject: [PATCH 1/2] Generate tables with parquet-files, instead of csv, in "dhp-stats-update/.../contexts.sh" script. --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index a436d0380..971b0da3f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -35,12 +35,20 @@ export HADOOP_USER="oozie" export HADOOP_USER_NAME="oozie" echo "Creating and populating impala tables" -hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" -hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" -hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" -hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" -hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" -hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" +hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','" +hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv" +hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv" +hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge" + +hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','" +hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv" +hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv" +hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge" + +hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','" +hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv" +hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv" +hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge" echo "Cleaning up" rm concepts.csv From 1fee4124e0ca335cbdc3e8768366a465ec3cb924 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Wed, 27 Mar 2024 12:58:25 +0200 Subject: [PATCH 2/2] added missing EOS --- .../oozie_app/scripts/step16-createIndicatorsTables.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 70cde6481..a091fadde 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -335,8 +335,8 @@ select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness from allresults ar join result_fair rf on rf.organization=ar.organization; /*EOS*/ -DROP VIEW result_fair; -DROP VIEW allresults; +DROP VIEW result_fair; /*EOS*/ +DROP VIEW allresults; /*EOS*/ CREATE TEMPORARY VIEW result_fair as select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro @@ -1006,7 +1006,7 @@ left outer join ${stats_db_name}.organization o on o.id=ro.organization left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.funder f on f.name=p.funder -where coalesce(o.country, f.country) IS NOT NULL; +where coalesce(o.country, f.country) IS NOT NULL; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as