Merge pull request 'added missing EOS, Generate tables with parquet-files, instead of csv in the contexts.sh script' (#408) from antonis.lempesis/dnet-hadoop:beta into master

Reviewed-on: #408
This commit is contained in:
Claudio Atzori 2024-03-27 12:02:57 +01:00
commit 5592ccc37a
2 changed files with 17 additions and 9 deletions

View File

@ -35,12 +35,20 @@ export HADOOP_USER="oozie"
export HADOOP_USER_NAME="oozie" export HADOOP_USER_NAME="oozie"
echo "Creating and populating impala tables" echo "Creating and populating impala tables"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv"
hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge"
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv"
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv"
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge"
echo "Cleaning up" echo "Cleaning up"
rm concepts.csv rm concepts.csv

View File

@ -335,8 +335,8 @@ select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar from allresults ar
join result_fair rf on rf.organization=ar.organization; /*EOS*/ join result_fair rf on rf.organization=ar.organization; /*EOS*/
DROP VIEW result_fair; DROP VIEW result_fair; /*EOS*/
DROP VIEW allresults; DROP VIEW allresults; /*EOS*/
CREATE TEMPORARY VIEW result_fair as CREATE TEMPORARY VIEW result_fair as
select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro
@ -1006,7 +1006,7 @@ left outer join ${stats_db_name}.organization o on o.id=ro.organization
left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
left outer join ${stats_db_name}.funder f on f.name=p.funder left outer join ${stats_db_name}.funder f on f.name=p.funder
where coalesce(o.country, f.country) IS NOT NULL; where coalesce(o.country, f.country) IS NOT NULL; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as