forked from D-Net/dnet-hadoop
first
This commit is contained in:
parent
2a52a42169
commit
e84dd5fe26
|
@ -30,18 +30,14 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
|
||||||
hdfs dfs -chmod -R 777 ${TMP}
|
hdfs dfs -chmod -R 777 ${TMP}
|
||||||
|
|
||||||
echo "Creating and populating impala tables"
|
echo "Creating and populating impala tables"
|
||||||
impala-shell -q "invalidate metadata"
|
hive -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
|
||||||
impala-shell -d ${TARGET_DB} -q "invalidate metadata"
|
hive -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
|
||||||
impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
|
hive -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
|
||||||
impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
|
hive -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
|
||||||
impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
|
hive -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
|
||||||
impala-shell -d ${TARGET_DB} -q "invalidate metadata"
|
hive -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
|
||||||
impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
|
|
||||||
impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
|
|
||||||
impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
|
|
||||||
|
|
||||||
echo "Cleaning up"
|
echo "Cleaning up"
|
||||||
hdfs dfs -rm -f -r -skipTrash ${TMP}
|
|
||||||
rm concepts.csv
|
rm concepts.csv
|
||||||
rm categories.csv
|
rm categories.csv
|
||||||
rm contexts.csv
|
rm contexts.csv
|
||||||
|
|
|
@ -10,9 +10,7 @@ export SOURCE=$1
|
||||||
export SHADOW=$2
|
export SHADOW=$2
|
||||||
|
|
||||||
echo "Updating shadow database"
|
echo "Updating shadow database"
|
||||||
impala-shell -q "invalidate metadata"
|
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" | impala-shell -c -f -
|
||||||
impala-shell -d ${SOURCE} -q "invalidate metadata"
|
|
||||||
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
|
|
||||||
impala-shell -q "create database if not exists ${SHADOW}"
|
impala-shell -q "create database if not exists ${SHADOW}"
|
||||||
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
|
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
|
||||||
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
|
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
|
||||||
|
|
|
@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
|
||||||
AND pr.id = p.id
|
AND pr.id = p.id
|
||||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
|
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication STORED AS PARQUET AS
|
||||||
SELECT result_projects.id AS result,
|
SELECT result_projects.id AS result,
|
||||||
result_projects.project AS project_results,
|
result_projects.project AS project_results,
|
||||||
result.date as resultdate,
|
result.date as resultdate,
|
||||||
|
|
|
@ -26,7 +26,7 @@ select substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
||||||
|
|
||||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
|
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed STORED AS PARQUET as
|
||||||
select * from ${stats_db_name}.publication_refereed
|
select * from ${stats_db_name}.publication_refereed
|
||||||
union all
|
union all
|
||||||
select * from ${stats_db_name}.dataset_refereed
|
select * from ${stats_db_name}.dataset_refereed
|
||||||
|
|
|
@ -29,8 +29,6 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
|
||||||
from rcount
|
from rcount
|
||||||
group by rcount.pid;
|
group by rcount.pid;
|
||||||
|
|
||||||
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
|
|
||||||
|
|
||||||
create table ${stats_db_name}.result_instance stored as parquet as
|
create table ${stats_db_name}.result_instance stored as parquet as
|
||||||
select distinct r.*
|
select distinct r.*
|
||||||
from (
|
from (
|
||||||
|
@ -39,12 +37,10 @@ from (
|
||||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
||||||
join ${stats_db_name}.result res on res.id=r.id;
|
join ${stats_db_name}.result res on res.id=r.id;
|
||||||
|
|
||||||
create table ${stats_db_name}.result_apc as
|
create table ${stats_db_name}.result_apc STORED AS PARQUET as
|
||||||
select r.id, r.amount, r.currency
|
select r.id, r.amount, r.currency
|
||||||
from (
|
from (
|
||||||
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
|
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
|
||||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
|
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
|
||||||
join ${stats_db_name}.result res on res.id=r.id
|
join ${stats_db_name}.result res on res.id=r.id
|
||||||
where r.amount is not null;
|
where r.amount is not null;
|
||||||
|
|
||||||
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;
|
|
|
@ -102,6 +102,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
||||||
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
||||||
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
|
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
|
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS
|
||||||
SELECT datasource AS id, id AS result
|
SELECT datasource AS id, id AS result
|
||||||
FROM ${stats_db_name}.result_datasources;
|
FROM ${stats_db_name}.result_datasources;
|
Loading…
Reference in New Issue