datasource table creation split in steps

This commit is contained in:
Antonis Lempesis 2024-09-30 17:12:21 +03:00
parent 619aa34a15
commit f3c179658a
2 changed files with 18 additions and 10 deletions

View File

@ -32,7 +32,7 @@ select distinct * from (
from SOURCE.result r from SOURCE.result r
join SOURCE.result_projects rp on rp.id=r.id join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project join SOURCE.project p on p.id=rp.project
join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder join TARGET.irish_funders irf on irf.funder=p.funder
union all union all
select r.* select r.*
from SOURCE.result r from SOURCE.result r
@ -238,4 +238,4 @@ create table TARGET.indi_pub_publicly_funded stored as parquet as select * from
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);

View File

@ -8,14 +8,20 @@ set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------------ ------------------------------------------------------------
------------------------------------------------------------ ------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
-- Remove the two helper tables if they already exist, so the CREATE
-- statements below always start from a clean state.
DROP TABLE IF EXISTS ${stats_db_name}.harested_datasources PURGE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.piwik_datasource PURGE; /*EOS*/

-- Helper table: the distinct hostedby.key values found on the instances of
-- ${openaire_db_name}.result, exposed as d_id.
-- LATERAL VIEW OUTER keeps results whose instance array produces no rows
-- (d_id is NULL for those).
CREATE TABLE ${stats_db_name}.harested_datasources STORED AS PARQUET AS
SELECT DISTINCT
    result_instance.hostedby.key AS d_id
FROM ${openaire_db_name}.result
    LATERAL VIEW OUTER explode(instance) result_instances AS result_instance; /*EOS*/

-- Helper table: one row per datasource original id that starts with "piwik:".
-- piwik_id is the second element of that original id when split on ':'.
CREATE TABLE ${stats_db_name}.piwik_datasource STORED AS PARQUET AS
SELECT
    id,
    split(original_id_entry, '\\:')[1] AS piwik_id
FROM ${openaire_db_name}.datasource
    LATERAL VIEW explode(originalid) original_ids AS original_id_entry
WHERE original_id_entry LIKE "piwik:%"; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource stored as parquet as CREATE TABLE ${stats_db_name}.datasource stored as parquet as
with piwik_datasource as (
select id, split(originalidd, '\\:')[1] as piwik_id
from ${openaire_db_name}.datasource
lateral view explode(originalid) temp as originalidd
where originalidd like "piwik:%"
)
select /*+ COALESCE(100) */ select /*+ COALESCE(100) */
substr(dtrce.id, 4) as id, substr(dtrce.id, 4) as id,
case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name, case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name,
@ -31,10 +37,12 @@ select /*+ COALESCE(100) */
dtrce.journal.issnprinted as issn_printed, dtrce.journal.issnprinted as issn_printed,
dtrce.journal.issnonline as issn_online dtrce.journal.issnonline as issn_online
from ${openaire_db_name}.datasource dtrce from ${openaire_db_name}.datasource dtrce
left outer join (select inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id left outer join ${stats_db_name}.harested_datasources res on res.d_id=dtrce.id
left outer join piwik_datasource piwik_d on piwik_d.id=dtrce.id left outer join ${stats_db_name}.piwik_datasource piwik_d on piwik_d.id=dtrce.id
where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/ where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/
-- Clean up the helper tables that were joined into the datasource table above.
-- IF EXISTS keeps the cleanup from failing when a table is already gone, and
-- PURGE matches the other DROP statements in this script.
DROP TABLE IF EXISTS ${stats_db_name}.harested_datasources purge; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.piwik_datasource purge; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/