forked from D-Net/dnet-hadoop
Separated impala from hive steps
This commit is contained in:
parent
de49173420
commit
e6f50de6ef
|
@ -0,0 +1,6 @@
|
|||
-- datasource_sources:
-- one row per datasource and collectedfrom entry, i.e. where the info of each
-- datasource has been collected from.
-- substr(x, 4) keeps each identifier from its fourth character onwards.
-- Datasources flagged as deletedbyinference are left out.
CREATE TABLE IF NOT EXISTS ${stats_db_name}.datasource_sources AS
SELECT
    substr(ds.id, 4)  AS id,
    substr(cf.key, 4) AS datasource
FROM ${openaire_db_name}.datasource ds
LATERAL VIEW explode(ds.collectedfrom) collected AS cf
WHERE ds.datainfo.deletedbyinference = false;
|
|
@ -2,7 +2,6 @@
|
|||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
||||
|
||||
-- Peer reviewed:
|
||||
-- Results that have been collected from Crossref
|
||||
create table ${stats_db_name}.result_peerreviewed as
|
||||
|
@ -75,59 +74,3 @@ join ${stats_db_name}.datasource d on d.id=rd.datasource
|
|||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
where o.country is not null and o.country!='';
|
||||
|
||||
-- Replace the creation of the result view with a staging table that also
-- carries the boolean fields from the previous tables (green, gold, peer reviewed).
-- The staging table is created explicitly inside the stats database, which is
-- the database that the DROP below and the INSERT statements that fill the
-- table address. An unqualified name would be resolved against the current
-- database of the session instead.
DROP TABLE IF EXISTS ${stats_db_name}.result_tmp;

CREATE TABLE ${stats_db_name}.result_tmp (
    id               STRING,
    title            STRING,
    publisher        STRING,
    journal          STRING,
    `date`           STRING,
    `year`           INT,
    bestlicence      STRING,
    access_mode      STRING,
    embargo_end_date STRING,
    delayed          BOOLEAN,
    authors          INT,
    source           STRING,
    abstract         BOOLEAN,
    type             STRING,
    peer_reviewed    BOOLEAN,
    green            BOOLEAN,
    gold             BOOLEAN
)
CLUSTERED BY (id) INTO 100 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
|
||||
|
||||
-- Stage every publication row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep publications that have no row in
-- a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.publication res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every dataset row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep datasets that have no row in a
-- flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.dataset res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every software row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep software rows that have no row in
-- a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.software res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every otherresearchproduct row in result_tmp together with its
-- peer_reviewed, green and gold flags. The outer joins keep rows that have no
-- row in a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.otherresearchproduct res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Publish the staged rows as the final result table, stored as Parquet.
-- Any earlier object named result is removed first, whether it is a table or
-- a view. The staging table is dropped once its rows have been copied.
DROP TABLE IF EXISTS ${stats_db_name}.result;
DROP VIEW IF EXISTS ${stats_db_name}.result;

CREATE TABLE ${stats_db_name}.result
STORED AS PARQUET
AS
SELECT *
FROM ${stats_db_name}.result_tmp;

DROP TABLE ${stats_db_name}.result_tmp;
|
|
@ -0,0 +1,55 @@
|
|||
-- Replace the creation of the result view with a staging table that also
-- carries the boolean fields from the previous tables (green, gold, peer reviewed).
-- The staging table is created explicitly inside the stats database, which is
-- the database that the DROP below and the INSERT statements that fill the
-- table address. An unqualified name would be resolved against the current
-- database of the session instead.
DROP TABLE IF EXISTS ${stats_db_name}.result_tmp;

CREATE TABLE ${stats_db_name}.result_tmp (
    id               STRING,
    title            STRING,
    publisher        STRING,
    journal          STRING,
    `date`           STRING,
    `year`           INT,
    bestlicence      STRING,
    access_mode      STRING,
    embargo_end_date STRING,
    delayed          BOOLEAN,
    authors          INT,
    source           STRING,
    abstract         BOOLEAN,
    type             STRING,
    peer_reviewed    BOOLEAN,
    green            BOOLEAN,
    gold             BOOLEAN
)
CLUSTERED BY (id) INTO 100 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
|
||||
|
||||
-- Stage every publication row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep publications that have no row in
-- a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.publication res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every dataset row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep datasets that have no row in a
-- flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.dataset res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every software row in result_tmp together with its peer_reviewed,
-- green and gold flags. The outer joins keep software rows that have no row in
-- a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.software res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Stage every otherresearchproduct row in result_tmp together with its
-- peer_reviewed, green and gold flags. The outer joins keep rows that have no
-- row in a flag table (that flag is then NULL).
-- Values are matched to the result_tmp columns by position.
-- NOTE(review): bestlicence is selected twice, so access_mode receives the
-- bestlicence value - confirm this is intended.
INSERT INTO ${stats_db_name}.result_tmp
SELECT
    res.id,
    res.title,
    res.publisher,
    res.journal,
    res.`date`,
    date_format(res.`date`, 'yyyy'),
    res.bestlicence,
    res.bestlicence,
    res.embargo_end_date,
    res.delayed,
    res.authors,
    res.source,
    res.abstract,
    res.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
FROM ${stats_db_name}.otherresearchproduct res
LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id;
|
||||
|
||||
-- Publish the staged rows as the final result table, stored as Parquet.
-- Any earlier object named result is removed first, whether it is a table or
-- a view. The staging table is dropped once its rows have been copied.
DROP TABLE IF EXISTS ${stats_db_name}.result;
DROP VIEW IF EXISTS ${stats_db_name}.result;

CREATE TABLE ${stats_db_name}.result
STORED AS PARQUET
AS
SELECT *
FROM ${stats_db_name}.result_tmp;

DROP TABLE ${stats_db_name}.result_tmp;
|
|
@ -4,7 +4,7 @@
|
|||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
-- Organization table: one row per organization that is not flagged as
-- deletedbyinference, with the id taken from its fourth character onwards,
-- the legal name, the legal short name and the country class id.
-- legalshortname is read through .value in the same way as legalname.
DROP TABLE IF EXISTS ${stats_db_name}.organization;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS
SELECT
    substr(o.id, 4)        AS id,
    o.legalname.value      AS name,
    o.legalshortname.value AS legalshortname,
    o.country.classid      AS country
FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE;
|
||||
|
||||
-- organization_datasources: the rows of datasource_organizations with the
-- organization column exposed as id and the id column exposed as datasource.
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT
    organization AS id,
    id           AS datasource
FROM ${stats_db_name}.datasource_organizations;
|
||||
|
|
|
@ -215,14 +215,39 @@
|
|||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step15_5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- Step15_5: runs scripts/step15_5.sql through a Hive2 action, passing the
     stats and openaire database names to the script as parameters.
     On success the workflow continues with Step16, on error it goes to Kill. -->
<action name="Step15_5">
    <hive2 xmlns="uri:oozie:hive2-action:0.1">
        <jdbc-url>${hive_jdbc_url}</jdbc-url>
        <script>scripts/step15_5.sql</script>
        <param>stats_db_name=${stats_db_name}</param>
        <param>openaire_db_name=${openaire_db_name}</param>
    </hive2>
    <ok to="Step16"/>
    <error to="Kill"/>
</action>
|
||||
|
||||
<!-- Step16: runs impala-shell.sh as a shell action with three arguments: the
     stats database name, the script file name and the path of that script
     under the oozie_app directory of the workflow user. impala-shell.sh is
     attached to the action through the file element.
     On success the workflow continues with Step16_5, on error it goes to Kill. -->
<action name="Step16">
    <shell xmlns="uri:oozie:shell-action:0.1">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <exec>impala-shell.sh</exec>
        <argument>${stats_db_name}</argument>
        <argument>step16.sql</argument>
        <argument>/user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step16.sql</argument>
        <file>impala-shell.sh</file>
    </shell>
    <ok to="Step16_5"/>
    <error to="Kill"/>
</action>
|
||||
|
||||
<action name="Step16_5">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step16.sql</script>
|
||||
<script>scripts/step16_5.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
|
|
Loading…
Reference in New Issue