first

2021-11-26 15:40:40 +02:00 · 2021-11-26 15:40:40 +02:00 · e84dd5fe26
parent 2a52a42169
commit e84dd5fe26
6 changed files with 12 additions and 22 deletions
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -30,18 +30,14 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
 hdfs dfs -chmod -R 777 ${TMP}

 echo "Creating and populating impala tables"
-impala-shell -q "invalidate metadata"
-impala-shell -d ${TARGET_DB} -q "invalidate metadata"
-impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
-impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
-impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
-impala-shell -d ${TARGET_DB} -q "invalidate metadata"
-impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
-impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
-impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+hive -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
+hive -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
+hive -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
+hive -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
+hive -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
+hive -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"

 echo "Cleaning up"
-hdfs dfs -rm -f -r -skipTrash ${TMP}
 rm concepts.csv
 rm categories.csv
 rm contexts.csv
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh
@@ -10,9 +10,7 @@ export SOURCE=$1
 export SHADOW=$2

 echo "Updating shadow database"
-impala-shell -q "invalidate metadata"
-impala-shell -d ${SOURCE} -q "invalidate metadata"
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
+hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f -
 impala-shell -q "create database if not exists ${SHADOW}"
 impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
 impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
@@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
                                        AND pr.id = p.id
                                        AND to_date(r.date) - to_date(p.enddate) > 0);

-CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
+CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS
 SELECT result_projects.id          AS result,
       result_projects.project     AS project_results,
       result.date                 as resultdate,
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
@@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
 where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;

-CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
+CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as
 select * from ${stats_db_name}.publication_refereed
 union all
 select * from ${stats_db_name}.dataset_refereed
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql
@@ -29,8 +29,6 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
 from rcount
 group by rcount.pid;

-create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
-
 create table ${stats_db_name}.result_instance stored as parquet as
 select distinct r.*
 from (
@@ -39,12 +37,10 @@ from (
         from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
 join ${stats_db_name}.result res on res.id=r.id;

-create table ${stats_db_name}.result_apc as
+create table ${stats_db_name}.result_apc STORED AS PARQUET as
 select r.id, r.amount, r.currency
 from (
         select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
         from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
 join ${stats_db_name}.result res on res.id=r.id
 where r.amount is not null;
-
-create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
@@ -102,6 +102,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
 from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
 where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;

-CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
+CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS
 SELECT datasource AS id, id AS result
 FROM ${stats_db_name}.result_datasources;