moving data to impala cluster and creating shadow databases there

Antonis Lempesis 2021-12-13 16:26:14 +02:00
parent d05210ba99
commit 915f758c82
5 changed files with 45 additions and 42 deletions
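The shadow databases these scripts maintain are databases of views, one view per table of the corresponding source database. A minimal sketch of that rebuild pattern, assembled from the Hive CLI calls in the scripts below (SOURCE and SHADOW are placeholder database names; foo is the same scratch file the scripts already use):

# Sketch: rebuild a shadow database as a set of views over SOURCE.
SOURCE=$1
SHADOW=$2
hive -e "create database if not exists ${SHADOW}"
# drop any stale views left over in the shadow database
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
hive -f foo
# recreate one view per table of the freshly built source database
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo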

View File

@@ -12,9 +12,3 @@ export SHADOW=$2
 echo "Updating shadow database"
 hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
 hive -f foo
-hive -e "create database if not exists ${SHADOW}"
-hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
-hive -f foo
-hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
-hive -f foo
-echo "Shadow db ready!"

View File

@@ -18,11 +18,3 @@ echo "Creating monitor database"
 cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
 hive -f foo
 echo "Impala shell finished"
-echo "Updating shadow monitor database"
-hive -e "create database if not exists ${SHADOW}"
-hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
-hive -f foo
-hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
-hive -f foo
-echo "Shadow db ready!"

View File

@@ -13,11 +13,3 @@ export SHADOW=$3
 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
 hive -f foo
 echo "Impala shell finished"
-echo "Updating shadow observatory database"
-hive -e "create database if not exists ${SHADOW}"
-hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
-hive -f foo
-hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
-hive -f foo
-echo "Shadow db ready!"

View File

@@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE;
 -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
 -- Creating a temporary dual table that will be removed after the following insert
-CREATE TABLE ${stats_db_name}.dual
-(
-    dummy CHAR(1)
-);
-INSERT INTO ${stats_db_name}.dual
-VALUES ('X');
+CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
+INSERT INTO ${stats_db_name}.dual VALUES ('X');
 INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
                                              `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
 SELECT 'other',
@@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual
 WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
 DROP TABLE ${stats_db_name}.dual;
-UPDATE ${stats_db_name}.datasource_tmp
-SET name='Other'
-WHERE name = 'Unknown Repository';
-UPDATE ${stats_db_name}.datasource_tmp
-SET yearofvalidation=null
-WHERE yearofvalidation = '-1';
+UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
+UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
 CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
 SELECT substr(d.id, 4) AS id, langs.languages AS language
@@ -91,8 +85,7 @@ FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS
 CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
 SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
 FROM ${openaire_db_name}.relation r
-WHERE r.reltype = 'datasourceOrganization'
-and r.datainfo.deletedbyinference = false;
+WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false;
 -- datasource sources:
 -- where the datasource info have been collected from.
@@ -101,6 +94,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
 from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
 where d.datainfo.deletedbyinference = false;
-CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS
+CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
 SELECT datasource AS id, id AS result
 FROM ${stats_db_name}.result_datasources;

View File

@@ -365,11 +365,43 @@
 <argument>${observatory_db_shadow_name}</argument>
 <file>observatory-post.sh</file>
 </shell>
-<ok to="Step22"/>
+<ok to="step22-copyDataToImpalaCluster"/>
 <error to="Kill"/>
 </action>
-<action name="Step22">
+<action name="step22-copyDataToImpalaCluster">
+<shell xmlns="uri:oozie:shell-action:0.1">
+<job-tracker>${jobTracker}</job-tracker>
+<name-node>${nameNode}</name-node>
+<exec>copyDataToImpalaCluster.sh</exec>
+<argument>${external_stats_db_name}</argument>
+<argument>${stats_db_name}</argument>
+<argument>${monitor_db_name}</argument>
+<argument>${observatory_db_name}</argument>
+<file>copyDataToImpalaCluster.sh</file>
+</shell>
+<ok to="step23-finalizeImpalaCluster"/>
+<error to="Kill"/>
+</action>
+<action name="step23-finalizeImpalaCluster">
+<shell xmlns="uri:oozie:shell-action:0.1">
+<job-tracker>${jobTracker}</job-tracker>
+<name-node>${nameNode}</name-node>
+<exec>finalizeImpalaCluster.sh</exec>
+<argument>${stats_db_name}</argument>
+<argument>${stats_db_shadow_name}</argument>
+<argument>${monitor_db_name}</argument>
+<argument>${monitor_db_shadow_name}</argument>
+<argument>${observatory_db_name}</argument>
+<argument>${observatory_db_shadow_name}</argument>
+<file>finalizeImpalaCluster.sh</file>
+</shell>
+<ok to="Step24-updateCache"/>
+<error to="Kill"/>
+</action>
+<action name="Step24-updateCache">
 <shell xmlns="uri:oozie:shell-action:0.1">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>