forked from D-Net/dnet-hadoop
moving data to impala cluster and creating shadow databases there
This commit is contained in:
parent
d05210ba99
commit
915f758c82
|
@ -11,10 +11,4 @@ export SHADOW=$2
|
|||
|
||||
echo "Updating shadow database"
|
||||
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
|
||||
hive -f foo
|
||||
hive -e "create database if not exists ${SHADOW}"
|
||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
||||
hive -f foo
|
||||
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db ready!"
|
||||
hive -f foo
|
|
@ -17,12 +17,4 @@ hdfs dfs -copyToLocal $4
|
|||
echo "Creating monitor database"
|
||||
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
|
||||
hive -f foo
|
||||
echo "Impala shell finished"
|
||||
|
||||
echo "Updating shadow monitor database"
|
||||
hive -e "create database if not exists ${SHADOW}"
|
||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
||||
hive -f foo
|
||||
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db ready!"
|
||||
echo "Impala shell finished"
|
|
@ -12,12 +12,4 @@ export SHADOW=$3
|
|||
|
||||
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||
hive -f foo
|
||||
echo "Impala shell finished"
|
||||
|
||||
echo "Updating shadow observatory database"
|
||||
hive -e "create database if not exists ${SHADOW}"
|
||||
hive --database ${SHADOW} -e "show tables" | grep -v WARN | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" > foo
|
||||
hive -f foo
|
||||
hive -d ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db ready!"
|
||||
echo "Impala shell finished"
|
|
@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE;
|
|||
|
||||
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
|
||||
-- Creating a temporary dual table that will be removed after the following insert
|
||||
CREATE TABLE ${stats_db_name}.dual
|
||||
(
|
||||
dummy CHAR(1)
|
||||
);
|
||||
INSERT INTO ${stats_db_name}.dual
|
||||
VALUES ('X');
|
||||
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
|
||||
|
||||
INSERT INTO ${stats_db_name}.dual VALUES ('X');
|
||||
|
||||
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
|
||||
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
|
||||
SELECT 'other',
|
||||
|
@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual
|
|||
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
|
||||
DROP TABLE ${stats_db_name}.dual;
|
||||
|
||||
UPDATE ${stats_db_name}.datasource_tmp
|
||||
SET name='Other'
|
||||
WHERE name = 'Unknown Repository';
|
||||
UPDATE ${stats_db_name}.datasource_tmp
|
||||
SET yearofvalidation=null
|
||||
WHERE yearofvalidation = '-1';
|
||||
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
|
||||
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
||||
|
@ -91,8 +85,7 @@ FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS
|
|||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'datasourceOrganization'
|
||||
and r.datainfo.deletedbyinference = false;
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false;
|
||||
|
||||
-- datasource sources:
|
||||
-- where the datasource info have been collected from.
|
||||
|
@ -101,6 +94,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
|||
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
||||
where d.datainfo.deletedbyinference = false;
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
|
||||
SELECT datasource AS id, id AS result
|
||||
FROM ${stats_db_name}.result_datasources;
|
|
@ -365,11 +365,43 @@
|
|||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>observatory-post.sh</file>
|
||||
</shell>
|
||||
<ok to="Step22"/>
|
||||
<ok to="step22-copyDataToImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step22">
|
||||
<action name="step22-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<argument>${external_stats_db_name}</argument>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<file>copyDataToImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="step23-finalizeImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="step23-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${stats_db_shadow_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="Step24-updateCache"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step24-updateCache">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
|
|
Loading…
Reference in New Issue