Merge pull request 'added missing EOS, Generate tables with parquet-files, instead of csv in the contexts.sh script' (#408) from antonis.lempesis/dnet-hadoop:beta into master

Reviewed-on: #408
2024-03-27 12:02:57 +01:00 · 2024-03-27 12:02:57 +01:00 · 5592ccc37a
parent d16c15da8d 1fee4124e0
commit 5592ccc37a
2 changed files with 17 additions and 9 deletions
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -35,12 +35,20 @@ export HADOOP_USER="oozie"
 export HADOOP_USER_NAME="oozie"

 echo "Creating and populating impala tables"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge"
+
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge"
+
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge"

 echo "Cleaning up"
 rm concepts.csv
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -335,8 +335,8 @@ select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
 from allresults ar
         join result_fair rf on rf.organization=ar.organization; /*EOS*/

-DROP VIEW result_fair;
-DROP VIEW allresults;
+DROP VIEW result_fair; /*EOS*/
+DROP VIEW allresults; /*EOS*/

 CREATE TEMPORARY VIEW result_fair as
    select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro
@@ -1006,7 +1006,7 @@ left outer join ${stats_db_name}.organization o on o.id=ro.organization
 left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id
 left outer join ${stats_db_name}.project p on p.id=rp.project
 left outer join ${stats_db_name}.funder f on f.name=p.funder
-where coalesce(o.country, f.country) IS NOT NULL;
+where coalesce(o.country, f.country) IS NOT NULL; /*EOS*/

 drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
 create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as