Merge pull request '[stats wf] various fixes, organization ids for inst. dashboard' (#205) from antonis.lempesis/dnet-hadoop:beta into beta

Reviewed-on: D-Net/dnet-hadoop#205
This commit is contained in:
Claudio Atzori 2022-03-28 22:03:49 +02:00
commit 9fa3dd78fe
17 changed files with 90 additions and 103 deletions

View File

@ -49,8 +49,5 @@ select * from openaire_prod_usage_stats.views_stats;
-- Creation date of the database -- Creation date of the database
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
create table ${stats_db_name}.creation_date as create table ${stats_db_name}.creation_date STORED AS PARQUET as
select date_format(current_date(), 'dd-MM-yyyy') as date; select date_format(current_date(), 'dd-MM-yyyy') as date;
--
-- ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS FOR COLUMNS;

View File

@ -113,7 +113,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project ${stats_db_name}.project
WHERE result_projects.id = result.id WHERE result_projects.id = result.id
AND result.type = 'publication' AND result.type = 'publication'
AND project.id = result_projects.project; AND project.id = result_projects.project;
-- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,7 +5,7 @@
-- Sources related tables/views -- Sources related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
@ -16,7 +16,7 @@ LEFT OUTER JOIN
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
@ -27,7 +27,7 @@ LEFT OUTER JOIN
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
@ -38,7 +38,7 @@ LEFT OUTER JOIN
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
@ -59,7 +59,7 @@ UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
create table ${stats_db_name}.result_orcid as create table ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
from ( from (
SELECT substr(res.id, 4) as id, auth_pid.value as orcid SELECT substr(res.id, 4) as id, auth_pid.value as orcid

View File

@ -5,22 +5,22 @@
-- Licences related tables/views -- Licences related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
@ -34,11 +34,11 @@ SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM ( FROM (
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
@ -46,17 +46,4 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS FOR COLUMNS;

View File

@ -6,22 +6,22 @@
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
select substr(r.id, 4) as id, inst.refereed.classname as refereed select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
select substr(r.id, 4) as id, inst.refereed.classname as refereed select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
select substr(r.id, 4) as id, inst.refereed.classname as refereed select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
select substr(r.id, 4) as id, inst.refereed.classname as refereed select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;

View File

@ -1,21 +1,21 @@
------------------------------------------- -------------------------------------------
--- Extra tables, mostly used by indicators --- Extra tables, mostly used by indicators
create table ${stats_db_name}.result_projectcount as create table ${stats_db_name}.result_projectcount STORED AS PARQUET as
select r.id, count(distinct p.id) as count select r.id, count(distinct p.id) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id;
create table ${stats_db_name}.result_fundercount as create table ${stats_db_name}.result_fundercount STORED AS PARQUET as
select r.id, count(distinct p.funder) as count select r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id;
create table ${stats_db_name}.project_resultcount as create table ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as ( with rcount as (
select p.id as pid, count(distinct r.id) as `count`, r.type as type select p.id as pid, count(distinct r.id) as `count`, r.type as type
from ${stats_db_name}.project p from ${stats_db_name}.project p

View File

@ -265,19 +265,19 @@ left outer join (
create table indi_org_openess stored as parquet as create table indi_org_openess stored as parquet as
WITH datasets_oa as ( WITH datasets_oa as (
SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa_new dg SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.dataset ds on dg.id=ds.id join openaire_prod_stats.dataset ds on dg.id=ds.id
WHERE dg.is_gold=1 WHERE dg.is_gold=1
group by ro.organization), group by ro.organization),
software_oa as ( software_oa as (
SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa_new dg SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.software ds on dg.id=ds.id join openaire_prod_stats.software ds on dg.id=ds.id
WHERE dg.is_gold=1 WHERE dg.is_gold=1
group by ro.organization), group by ro.organization),
pubs_oa as ( pubs_oa as (
SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa_new dg SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.publication ds on dg.id=ds.id join openaire_prod_stats.publication ds on dg.id=ds.id
where dg.is_gold=1 where dg.is_gold=1

View File

@ -3,20 +3,20 @@
---------------------------------------------------- ----------------------------------------------------
-- Peer reviewed: -- Peer reviewed:
create table ${stats_db_name}.result_peerreviewed as create table ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
-- Green OA: -- Green OA:
create table ${stats_db_name}.result_greenoa as create table ${stats_db_name}.result_greenoa STORED AS PARQUET as
select r.id, case when green.green_oa=1 then true else false end as green select r.id, case when green.green_oa=1 then true else false end as green
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
-- GOLD OA: -- GOLD OA:
create table ${stats_db_name}.result_gold as create table ${stats_db_name}.result_gold STORED AS PARQUET as
select r.id, case when gold.is_gold=1 then true else false end as gold select r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;

View File

@ -40,13 +40,13 @@ SELECT substr(p.id, 4) as id,
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_classifications AS CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, instancetype.classname as type SELECT substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_concepts AS CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
@ -55,7 +55,7 @@ from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_datasources as CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
@ -66,30 +66,30 @@ FROM (
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.publication_languages AS CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
select substr(p.id, 4) as id, p.language.classname as language select substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_oids AS CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_pids AS CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_topics as CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_citations AS CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation

View File

@ -22,7 +22,16 @@ create table TARGET.result stored as parquet as
'openorgs____::b84450f9864182c67b8611b5593f4250', 'openorgs____::b84450f9864182c67b8611b5593f4250',
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', 'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2') )) foo; 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2',
'openorgs____::d169c7407dd417152596908d48c11460',
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21',
'openorgs____::2fb1e47b4612688d9de9169d579939a7',
'openorgs____::759d59f05d77188faee99b7493b46805',
'openorgs____::cad284878801b9465fa51a95b1d779db',
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
'openorgs____::c0286313e36479eff8676dba9b724b40'
-- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot
) )) foo;
compute stats TARGET.result; compute stats TARGET.result;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
@ -52,7 +61,7 @@ compute stats TARGET.result_languages;
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_licenses; compute stats TARGET.result_licenses;
create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_oids; compute stats TARGET.result_oids;
@ -83,7 +92,7 @@ compute stats TARGET.result_topics;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1; drop view TARGET.foo1;
drop view TARGET.foo2; drop view TARGET.foo2;
compute stats TARGET.result_result; compute stats TARGET.result_result;
@ -157,13 +166,13 @@ create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_hybrid_oa_with_cc; compute stats TARGET.indi_pub_hybrid_oa_with_cc;
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads; compute stats TARGET.indi_pub_downloads;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_datasource; compute stats TARGET.indi_pub_downloads_datasource;
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_year; compute stats TARGET.indi_pub_downloads_year;
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_datasource_year; compute stats TARGET.indi_pub_downloads_datasource_year;
--denorm --denorm

View File

@ -40,20 +40,20 @@ SELECT substr(d.id, 4) AS id,
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_citations AS CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_classifications AS CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_concepts AS CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
@ -62,7 +62,7 @@ from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_datasources AS CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
@ -74,24 +74,24 @@ FROM (
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.dataset_languages AS CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_oids AS CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_pids AS CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_topics AS CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject

View File

@ -40,20 +40,20 @@ SELECT substr(s.id, 4) as id,
from ${openaire_db_name}.software s from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_citations AS CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_classifications AS CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_concepts AS CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
@ -62,7 +62,7 @@ FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_datasources AS CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
@ -74,24 +74,24 @@ FROM (
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.software_languages AS CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
select substr(p.id, 4) AS id, p.language.classname AS language select substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_oids AS CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_pids AS CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_topics AS CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject

View File

@ -40,18 +40,18 @@ FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
-- Otherresearchproduct_citations -- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
@ -59,7 +59,7 @@ SELECT substr(p.id, 4) as id, case
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
@ -68,22 +68,22 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@ -3,24 +3,24 @@
-- Project table/view and Project related tables/views -- Project table/view and Project related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
CREATE TABLE ${stats_db_name}.project_oids AS CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_organizations AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' WHERE r.reltype = 'projectOrganization'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_results AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' WHERE r.reltype = 'resultProject'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
create table ${stats_db_name}.project_classification as create table ${stats_db_name}.project_classification STORED AS PARQUET as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class lateral view explode(p.h2020classification) classifs as class
@ -76,7 +76,7 @@ SELECT substr(p.id, 4) AS id,
FROM ${openaire_db_name}.project p FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
create table ${stats_db_name}.funder as create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id, select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname xpath_string(fund, '//funder/shortname') as shortname

View File

@ -123,13 +123,13 @@ UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics; FROM ${stats_db_name}.otherresearchproduct_topics;
CREATE TABLE ${stats_db_name}.result_organization AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization' WHERE r.reltype = 'resultOrganization'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.result_projects AS CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result

View File

@ -80,24 +80,24 @@ UPDATE ${stats_db_name}.datasource_tmp
SET yearofvalidation=null SET yearofvalidation=null
WHERE yearofvalidation = '-1'; WHERE yearofvalidation = '-1';
CREATE TABLE ${stats_db_name}.datasource_languages AS CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.datasource_oids AS CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.datasource_organizations AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
-- datasource sources: -- datasource sources:
-- where the datasource info have been collected from. -- where the datasource info have been collected from.
create table if not exists ${stats_db_name}.datasource_sources AS create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;

View File

@ -3,7 +3,7 @@
-- Organization table/view and Organization related tables/views -- Organization table/view and Organization related tables/views
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
SELECT substr(o.id, 4) as id, SELECT substr(o.id, 4) as id,
o.legalname.value as name, o.legalname.value as name,
o.legalshortname.value as legalshortname, o.legalshortname.value as legalshortname,
@ -11,13 +11,10 @@ SELECT substr(o.id, 4) as id,
FROM ${openaire_db_name}.organization o FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE;
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources STORED AS PARQUET AS
SELECT organization AS id, id AS datasource SELECT organization AS id, id AS datasource
FROM ${stats_db_name}.datasource_organizations; FROM ${stats_db_name}.datasource_organizations;
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects STORED AS PARQUET AS
SELECT id AS project, organization as id SELECT id AS project, organization as id
FROM ${stats_db_name}.project_organizations; FROM ${stats_db_name}.project_organizations;
-- ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS FOR COLUMNS;