forked from antonis.lempesis/dnet-hadoop
moving data to impala cluster and creating shadow databases there
This commit is contained in:
parent
d05210ba99
commit
915f758c82
|
@ -12,9 +12,3 @@ export SHADOW=$2
|
||||||
echo "Updating shadow database"
|
echo "Updating shadow database"
|
||||||
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
|
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
|
||||||
hive -f foo
|
hive -f foo
|
||||||
hive -e "create database if not exists ${SHADOW}"
|
|
||||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
echo "Shadow db ready!"
|
|
|
@ -18,11 +18,3 @@ echo "Creating monitor database"
|
||||||
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
|
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
|
||||||
hive -f foo
|
hive -f foo
|
||||||
echo "Impala shell finished"
|
echo "Impala shell finished"
|
||||||
|
|
||||||
echo "Updating shadow monitor database"
|
|
||||||
hive -e "create database if not exists ${SHADOW}"
|
|
||||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
echo "Shadow db ready!"
|
|
|
@ -13,11 +13,3 @@ export SHADOW=$3
|
||||||
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||||
hive -f foo
|
hive -f foo
|
||||||
echo "Impala shell finished"
|
echo "Impala shell finished"
|
||||||
|
|
||||||
echo "Updating shadow observatory database"
|
|
||||||
hive -e "create database if not exists ${SHADOW}"
|
|
||||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
|
|
||||||
hive -f foo
|
|
||||||
echo "Shadow db ready!"
|
|
|
@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE;
|
||||||
|
|
||||||
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
|
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
|
||||||
-- Creating a temporary dual table that will be removed after the following insert
|
-- Creating a temporary dual table that will be removed after the following insert
|
||||||
CREATE TABLE ${stats_db_name}.dual
|
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
|
||||||
(
|
|
||||||
dummy CHAR(1)
|
INSERT INTO ${stats_db_name}.dual VALUES ('X');
|
||||||
);
|
|
||||||
INSERT INTO ${stats_db_name}.dual
|
|
||||||
VALUES ('X');
|
|
||||||
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
|
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
|
||||||
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
|
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
|
||||||
SELECT 'other',
|
SELECT 'other',
|
||||||
|
@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual
|
||||||
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
|
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
|
||||||
DROP TABLE ${stats_db_name}.dual;
|
DROP TABLE ${stats_db_name}.dual;
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.datasource_tmp
|
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
|
||||||
SET name='Other'
|
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
|
||||||
WHERE name = 'Unknown Repository';
|
|
||||||
UPDATE ${stats_db_name}.datasource_tmp
|
|
||||||
SET yearofvalidation=null
|
|
||||||
WHERE yearofvalidation = '-1';
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
||||||
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
||||||
|
@ -91,8 +85,7 @@ FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS
|
||||||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||||
FROM ${openaire_db_name}.relation r
|
FROM ${openaire_db_name}.relation r
|
||||||
WHERE r.reltype = 'datasourceOrganization'
|
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false;
|
||||||
and r.datainfo.deletedbyinference = false;
|
|
||||||
|
|
||||||
-- datasource sources:
|
-- datasource sources:
|
||||||
-- where the datasource info have been collected from.
|
-- where the datasource info have been collected from.
|
||||||
|
@ -101,6 +94,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
||||||
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
||||||
where d.datainfo.deletedbyinference = false;
|
where d.datainfo.deletedbyinference = false;
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS
|
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
|
||||||
SELECT datasource AS id, id AS result
|
SELECT datasource AS id, id AS result
|
||||||
FROM ${stats_db_name}.result_datasources;
|
FROM ${stats_db_name}.result_datasources;
|
|
@ -365,11 +365,43 @@
|
||||||
<argument>${observatory_db_shadow_name}</argument>
|
<argument>${observatory_db_shadow_name}</argument>
|
||||||
<file>observatory-post.sh</file>
|
<file>observatory-post.sh</file>
|
||||||
</shell>
|
</shell>
|
||||||
<ok to="Step22"/>
|
<ok to="step22-copyDataToImpalaCluster"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="Step22">
|
<action name="step22-copyDataToImpalaCluster">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>copyDataToImpalaCluster.sh</exec>
|
||||||
|
<argument>${external_stats_db_name}</argument>
|
||||||
|
<argument>${stats_db_name}</argument>
|
||||||
|
<argument>${monitor_db_name}</argument>
|
||||||
|
<argument>${observatory_db_name}</argument>
|
||||||
|
<file>copyDataToImpalaCluster.sh</file>
|
||||||
|
</shell>
|
||||||
|
<ok to="step23-finalizeImpalaCluster"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="step23-finalizeImpalaCluster">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>finalizeImpalaCluster.sh</exec>
|
||||||
|
<argument>${stats_db_name}</argument>
|
||||||
|
<argument>${stats_db_shadow_name}</argument>
|
||||||
|
<argument>${monitor_db_name}</argument>
|
||||||
|
<argument>${monitor_db_shadow_name}</argument>
|
||||||
|
<argument>${observatory_db_name}</argument>
|
||||||
|
<argument>${observatory_db_shadow_name}</argument>
|
||||||
|
<file>finalizeImpalaCluster.sh</file>
|
||||||
|
</shell>
|
||||||
|
<ok to="Step24-updateCache"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="Step24-updateCache">
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
|
|
Loading…
Reference in New Issue