forked from D-Net/dnet-hadoop
Separated impala from hive steps
This commit is contained in:
parent
de49173420
commit
e6f50de6ef
|
@ -0,0 +1,6 @@
|
||||||
|
-- Datasource sources:
-- one row per (datasource, collectedfrom entry), recording where the
-- datasource information has been collected from.
-- substr(..., 4) drops the first three characters of each identifier.
create table if not exists ${stats_db_name}.datasource_sources as
select
    substr(ds.id, 4)        as id,
    substr(origin.key, 4)   as datasource
from ${openaire_db_name}.datasource ds
lateral view explode(ds.collectedfrom) origins as origin
where ds.datainfo.deletedbyinference = false;
|
|
@ -2,7 +2,6 @@
|
||||||
-- Shortcuts for various definitions in stats db ---
|
-- Shortcuts for various definitions in stats db ---
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
-- Peer reviewed:
|
-- Peer reviewed:
|
||||||
-- Results that have been collected from Crossref
|
-- Results that have been collected from Crossref
|
||||||
create table ${stats_db_name}.result_peerreviewed as
|
create table ${stats_db_name}.result_peerreviewed as
|
||||||
|
@ -74,60 +73,4 @@ join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||||
join ${stats_db_name}.datasource d on d.id=rd.datasource
|
join ${stats_db_name}.datasource d on d.id=rd.datasource
|
||||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||||
where o.country is not null and o.country!='';
|
where o.country is not null and o.country!='';
|
||||||
|
|
||||||
-- Rebuild the result table so that it also carries the boolean flags computed
-- in the previous tables (peer_reviewed, green, gold).
--
-- result_tmp is a transactional ORC staging table, bucketed on id. It is
-- filled by the insert statements that follow and then copied into result.
drop table if exists ${stats_db_name}.result_tmp;

-- The table name is qualified with ${stats_db_name} so that the table is
-- created in the same database that the drop above and the later inserts
-- refer to, rather than in whatever database the session currently uses.
create table ${stats_db_name}.result_tmp (
    id               string,
    title            string,
    publisher        string,
    journal          string,
    `date`           string,
    `year`           int,
    bestlicence      string,
    access_mode      string,
    embargo_end_date string,
    delayed          boolean,
    authors          int,
    source           string,
    abstract         boolean,
    type             string,
    peer_reviewed    boolean,
    green            boolean,
    gold             boolean
)
clustered by (id) into 100 buckets
stored as orc
tblproperties ('transactional' = 'true');
|
|
||||||
|
|
||||||
-- Publications: copy every publication row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep publications without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    pub.id,
    pub.title,
    pub.publisher,
    pub.journal,
    pub.`date`,
    date_format(pub.`date`, 'yyyy'),   -- year
    pub.bestlicence,
    pub.bestlicence,                   -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    pub.embargo_end_date,
    pub.delayed,
    pub.authors,
    pub.source,
    pub.abstract,
    pub.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.publication pub
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = pub.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = pub.id
left join ${stats_db_name}.result_gold gld
    on gld.id = pub.id;
|
|
||||||
|
|
||||||
-- Datasets: copy every dataset row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep datasets without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    dset.id,
    dset.title,
    dset.publisher,
    dset.journal,
    dset.`date`,
    date_format(dset.`date`, 'yyyy'),  -- year
    dset.bestlicence,
    dset.bestlicence,                  -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    dset.embargo_end_date,
    dset.delayed,
    dset.authors,
    dset.source,
    dset.abstract,
    dset.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.dataset dset
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = dset.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = dset.id
left join ${stats_db_name}.result_gold gld
    on gld.id = dset.id;
|
|
||||||
|
|
||||||
-- Software: copy every software row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep software rows without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    sw.id,
    sw.title,
    sw.publisher,
    sw.journal,
    sw.`date`,
    date_format(sw.`date`, 'yyyy'),    -- year
    sw.bestlicence,
    sw.bestlicence,                    -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    sw.embargo_end_date,
    sw.delayed,
    sw.authors,
    sw.source,
    sw.abstract,
    sw.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.software sw
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = sw.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = sw.id
left join ${stats_db_name}.result_gold gld
    on gld.id = sw.id;
|
|
||||||
|
|
||||||
-- Other research products: copy every otherresearchproduct row into
-- result_tmp, attaching the peer_reviewed / green / gold flags from the
-- helper tables. The left joins keep rows without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    orp.id,
    orp.title,
    orp.publisher,
    orp.journal,
    orp.`date`,
    date_format(orp.`date`, 'yyyy'),   -- year
    orp.bestlicence,
    orp.bestlicence,                   -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    orp.embargo_end_date,
    orp.delayed,
    orp.authors,
    orp.source,
    orp.abstract,
    orp.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.otherresearchproduct orp
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = orp.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = orp.id
left join ${stats_db_name}.result_gold gld
    on gld.id = orp.id;
|
|
||||||
|
|
||||||
-- Publish the staged rows: remove any existing object named result (it is
-- dropped both as a table and as a view), materialise result_tmp as a
-- Parquet table named result, then discard the staging table.
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;

create table ${stats_db_name}.result
stored as parquet
as
select * from ${stats_db_name}.result_tmp;

drop table ${stats_db_name}.result_tmp;
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
-- Rebuild the result table so that it also carries the boolean flags computed
-- in the previous tables (peer_reviewed, green, gold).
--
-- result_tmp is a transactional ORC staging table, bucketed on id. It is
-- filled by the insert statements that follow and then copied into result.
drop table if exists ${stats_db_name}.result_tmp;

-- The table name is qualified with ${stats_db_name} so that the table is
-- created in the same database that the drop above and the later inserts
-- refer to, rather than in whatever database the session currently uses.
create table ${stats_db_name}.result_tmp (
    id               string,
    title            string,
    publisher        string,
    journal          string,
    `date`           string,
    `year`           int,
    bestlicence      string,
    access_mode      string,
    embargo_end_date string,
    delayed          boolean,
    authors          int,
    source           string,
    abstract         boolean,
    type             string,
    peer_reviewed    boolean,
    green            boolean,
    gold             boolean
)
clustered by (id) into 100 buckets
stored as orc
tblproperties ('transactional' = 'true');
|
||||||
|
|
||||||
|
-- Publications: copy every publication row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep publications without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    pub.id,
    pub.title,
    pub.publisher,
    pub.journal,
    pub.`date`,
    date_format(pub.`date`, 'yyyy'),   -- year
    pub.bestlicence,
    pub.bestlicence,                   -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    pub.embargo_end_date,
    pub.delayed,
    pub.authors,
    pub.source,
    pub.abstract,
    pub.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.publication pub
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = pub.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = pub.id
left join ${stats_db_name}.result_gold gld
    on gld.id = pub.id;
|
||||||
|
|
||||||
|
-- Datasets: copy every dataset row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep datasets without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    dset.id,
    dset.title,
    dset.publisher,
    dset.journal,
    dset.`date`,
    date_format(dset.`date`, 'yyyy'),  -- year
    dset.bestlicence,
    dset.bestlicence,                  -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    dset.embargo_end_date,
    dset.delayed,
    dset.authors,
    dset.source,
    dset.abstract,
    dset.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.dataset dset
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = dset.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = dset.id
left join ${stats_db_name}.result_gold gld
    on gld.id = dset.id;
|
||||||
|
|
||||||
|
-- Software: copy every software row into result_tmp, attaching the
-- peer_reviewed / green / gold flags from the helper tables. The left joins
-- keep software rows without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    sw.id,
    sw.title,
    sw.publisher,
    sw.journal,
    sw.`date`,
    date_format(sw.`date`, 'yyyy'),    -- year
    sw.bestlicence,
    sw.bestlicence,                    -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    sw.embargo_end_date,
    sw.delayed,
    sw.authors,
    sw.source,
    sw.abstract,
    sw.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.software sw
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = sw.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = sw.id
left join ${stats_db_name}.result_gold gld
    on gld.id = sw.id;
|
||||||
|
|
||||||
|
-- Other research products: copy every otherresearchproduct row into
-- result_tmp, attaching the peer_reviewed / green / gold flags from the
-- helper tables. The left joins keep rows without a match; their flags are NULL.
-- Values are matched to result_tmp columns by position.
insert into ${stats_db_name}.result_tmp
select
    orp.id,
    orp.title,
    orp.publisher,
    orp.journal,
    orp.`date`,
    date_format(orp.`date`, 'yyyy'),   -- year
    orp.bestlicence,
    orp.bestlicence,                   -- lands in access_mode; NOTE(review): confirm it should mirror bestlicence
    orp.embargo_end_date,
    orp.delayed,
    orp.authors,
    orp.source,
    orp.abstract,
    orp.type,
    peer.peer_reviewed,
    goa.green,
    gld.gold
from ${stats_db_name}.otherresearchproduct orp
left join ${stats_db_name}.result_peerreviewed peer
    on peer.id = orp.id
left join ${stats_db_name}.result_greenoa goa
    on goa.id = orp.id
left join ${stats_db_name}.result_gold gld
    on gld.id = orp.id;
|
||||||
|
|
||||||
|
-- Publish the staged rows: remove any existing object named result (it is
-- dropped both as a table and as a view), materialise result_tmp as a
-- Parquet table named result, then discard the staging table.
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;

create table ${stats_db_name}.result
stored as parquet
as
select * from ${stats_db_name}.result_tmp;

drop table ${stats_db_name}.result_tmp;
|
|
@ -4,7 +4,7 @@
|
||||||
----------------------------------------------------------------
|
----------------------------------------------------------------
|
||||||
----------------------------------------------------------------
|
----------------------------------------------------------------
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.organization;
|
DROP TABLE IF EXISTS ${stats_db_name}.organization;
|
||||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname as legalshortname, o.country.classid as country
|
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname.value as legalshortname, o.country.classid as country
|
||||||
FROM ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=FALSE;
|
FROM ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=FALSE;
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;
|
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;
|
||||||
|
|
|
@ -215,14 +215,39 @@
|
||||||
<param>stats_db_name=${stats_db_name}</param>
|
<param>stats_db_name=${stats_db_name}</param>
|
||||||
<param>openaire_db_name=${openaire_db_name}</param>
|
<param>openaire_db_name=${openaire_db_name}</param>
|
||||||
</hive2>
|
</hive2>
|
||||||
|
<ok to="Step15_5"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="Step15_5">
|
||||||
|
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||||
|
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||||
|
<script>scripts/step15_5.sql</script>
|
||||||
|
<param>stats_db_name=${stats_db_name}</param>
|
||||||
|
<param>openaire_db_name=${openaire_db_name}</param>
|
||||||
|
</hive2>
|
||||||
<ok to="Step16"/>
|
<ok to="Step16"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="Step16">
|
<action name="Step16">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>impala-shell.sh</exec>
|
||||||
|
<argument>${stats_db_name}</argument>
|
||||||
|
<argument>step16.sql</argument>
|
||||||
|
<argument>/user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step16.sql</argument>
|
||||||
|
<file>impala-shell.sh</file>
|
||||||
|
</shell>
|
||||||
|
<ok to="Step16_5"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="Step16_5">
|
||||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||||
<script>scripts/step16.sql</script>
|
<script>scripts/step16_5.sql</script>
|
||||||
<param>stats_db_name=${stats_db_name}</param>
|
<param>stats_db_name=${stats_db_name}</param>
|
||||||
<param>openaire_db_name=${openaire_db_name}</param>
|
<param>openaire_db_name=${openaire_db_name}</param>
|
||||||
</hive2>
|
</hive2>
|
||||||
|
|
Loading…
Reference in New Issue