stats_wf_extensions_and_corrections #28

Merged
claudio.atzori merged 22 commits from spyros/dnet-hadoop:stats_wf_extensions_and_corrections into master 2020-07-27 16:02:04 +02:00
2 changed files with 16 additions and 48 deletions
Showing only changes of commit c9cfc165d9 - Show all commits

View File

@ -1,19 +1,7 @@
---------------------------------------------------- ----------------------------------------------------
-- Shortcuts for various definitions in stats db --- -- Shortcuts for various definitions in stats db ---
-- since these statements are executed using Impala,
-- we'll have to compute the stats for the tables we use
---------------------------------------------------- ----------------------------------------------------
COMPUTE STATS result;
COMPUTE STATS result_sources;
COMPUTE STATS datasource;
COMPUTE STATS result_datasources;
COMPUTE STATS datasource_sources;
COMPUTE STATS country;
COMPUTE STATS result_organization;
COMPUTE STATS organization;
COMPUTE STATS datasource_organizations;
-- Peer reviewed: -- Peer reviewed:
-- Results that have been collected from Crossref -- Results that have been collected from Crossref
create table ${stats_db_name}.result_peerreviewed as create table ${stats_db_name}.result_peerreviewed as
@ -28,9 +16,8 @@ from peer_reviewed
union all union all
select distinct r.id as id, false as peer_reviewed select distinct r.id as id, false as peer_reviewed
from ${stats_db_name}.result r from ${stats_db_name}.result r
where r.id not in (select id from peer_reviewed); left outer join peer_reviewed pr on pr.id=r.id
where pr.id is null;
COMPUTE STATS result_peerreviewed;
-- Green OA: -- Green OA:
-- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal. -- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal.
@ -40,20 +27,21 @@ with result_green as (
from ${stats_db_name}.result r from ${stats_db_name}.result r
join ${stats_db_name}.result_datasources rd on rd.id=r.id join ${stats_db_name}.result_datasources rd on rd.id=r.id
join ${stats_db_name}.datasource d on d.id=rd.datasource join ${stats_db_name}.datasource d on d.id=rd.datasource
where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and not exists ( left outer join (
select 1 from ${stats_db_name}.result_datasources rd select rd.id from ${stats_db_name}.result_datasources rd
join ${stats_db_name}.datasource d on rd.datasource=d.id join ${stats_db_name}.datasource d on rd.datasource=d.id
join ${stats_db_name}.datasource_sources sds on sds.id=d.id join ${stats_db_name}.datasource_sources sds on sds.id=d.id
join ${stats_db_name}.datasource sd on sd.id=sds.datasource join ${stats_db_name}.datasource sd on sd.id=sds.datasource
where sd.name='DOAJ-ARTICLES' and rd.id=r.id)) where sd.name='DOAJ-ARTICLES'
) as doaj on doaj.id=r.id
where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null)
select distinct result_green.id, true as green select distinct result_green.id, true as green
from result_green from result_green
union all union all
select distinct r.id as id, false as green select distinct r.id as id, false as green
from ${stats_db_name}.result r from ${stats_db_name}.result r
where r.id not in (select id from result_green); left outer join result_green rg on rg.id=r.id
where rg.id is null;
COMPUTE STATS result_greenoa;
-- GOLD OA: -- GOLD OA:
-- OA results that have been harvested from a DOAJ journal. -- OA results that have been harvested from a DOAJ journal.
@ -73,8 +61,6 @@ select distinct r.id, false as gold
from ${stats_db_name}.result r from ${stats_db_name}.result r
where r.id not in (select id from result_gold); where r.id not in (select id from result_gold);
COMPUTE STATS result_gold;
-- Shortcut table: links each result to the countries of its affiliated organizations -- Shortcut table: links each result to the countries of its affiliated organizations
create table ${stats_db_name}.result_affiliated_country as create table ${stats_db_name}.result_affiliated_country as
select r.id as id, o.country as country select r.id as id, o.country as country
@ -83,8 +69,6 @@ join ${stats_db_name}.result_organization ro on ro.id=r.id
join ${stats_db_name}.organization o on o.id=ro.organization join ${stats_db_name}.organization o on o.id=ro.organization
where o.country is not null and o.country!=''; where o.country is not null and o.country!='';
COMPUTE STATS result_affiliated_country;
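-- Shortcut table: links each result to the countries of the organizations behind the datasources it is deposited in -- Shortcut table: links each result to the countries of the organizations behind the datasources it is deposited in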
create table ${stats_db_name}.result_deposited_country as create table ${stats_db_name}.result_deposited_country as
select r.id as id, o.country as country select r.id as id, o.country as country
@ -94,5 +78,3 @@ join ${stats_db_name}.datasource d on d.id=rd.datasource
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
join ${stats_db_name}.organization o on o.id=dor.organization join ${stats_db_name}.organization o on o.id=dor.organization
where o.country is not null and o.country!=''; where o.country is not null and o.country!='';
COMPUTE STATS result_deposited_country;

View File

@ -215,31 +215,17 @@
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param> <param>openaire_db_name=${openaire_db_name}</param>
</hive2> </hive2>
<ok to="Step15_5"/>
<error to="Kill"/>
</action>
<action name="Step15_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step15_5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step16"/> <ok to="Step16"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step16"> <action name="Step16">
<shell xmlns="uri:oozie:shell-action:0.1"> <hive2 xmlns="uri:oozie:hive2-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <jdbc-url>${hive_jdbc_url}</jdbc-url>
<name-node>${nameNode}</name-node> <script>scripts/step16.sql</script>
<exec>impala-shell.sh</exec> <param>stats_db_name=${stats_db_name}</param>
<argument>${stats_db_name}</argument> <param>openaire_db_name=${openaire_db_name}</param>
<argument>step16.sql</argument> </hive2>
<argument>/user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step16.sql</argument>
<file>impala-shell.sh</file>
</shell>
<ok to="Step16_5"/> <ok to="Step16_5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>