1
0
Fork 0

fixed formatting

This commit is contained in:
Antonis Lempesis 2021-02-14 03:14:24 +02:00
parent 2c4dcc90ba
commit 1c029b9fc0
11 changed files with 744 additions and 190 deletions

View File

@ -3,14 +3,37 @@
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture) -- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref; CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country; SELECT *
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp; FROM ${external_stats_db_name}.fundref;
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; CREATE OR REPLACE VIEW ${stats_db_name}.country AS
CREATE OR REPLACE VIEW ${stats_db_name}.context AS SELECT * FROM ${external_stats_db_name}.context; SELECT *
CREATE OR REPLACE VIEW ${stats_db_name}.category AS SELECT * FROM ${external_stats_db_name}.category; FROM ${external_stats_db_name}.country;
CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_stats_db_name}.concept;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.countrygdp
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
    SELECT src.*
    FROM ${external_stats_db_name}.countrygdp src;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.roarmap
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
    SELECT src.*
    FROM ${external_stats_db_name}.roarmap src;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.rndexpediture
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
    SELECT src.*
    FROM ${external_stats_db_name}.rndexpediture src;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.context
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.context AS
    SELECT src.*
    FROM ${external_stats_db_name}.context src;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.category
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.category AS
    SELECT src.*
    FROM ${external_stats_db_name}.category src;
-- Pass-through view: re-exposes every column of ${external_stats_db_name}.concept
-- under the same name inside ${stats_db_name}.
CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
    SELECT src.*
    FROM ${external_stats_db_name}.concept src;
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
@ -18,7 +41,8 @@ CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_sta
-- Creation date of the database -- Creation date of the database
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date; create table ${stats_db_name}.creation_date as
select date_format(current_date(), 'dd-MM-yyyy') as date;
ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,43 +5,114 @@
---------------------------------------------------------------- ----------------------------------------------------------------
--Datasource temporary table updates --Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource); UPDATE ${stats_db_name}.datasource_tmp
SET harvested='true'
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
FROM ${stats_db_name}.datasource_tmp d,
${stats_db_name}.result_datasources rd
WHERE d.id = rd.datasource);
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication'); UPDATE ${stats_db_name}.project_tmp
SET haspubs='yes'
WHERE project_tmp.id IN (SELECT pr.id
FROM ${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pr.result = r.id
AND r.type = 'publication');
DROP TABLE IF EXISTS ${stats_db_name}.project;
CREATE TABLE ${stats_db_name}.project stored as parquet as CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration, SELECT p.id,
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, p.acronym,
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, p.title,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, p.funder,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, p.funding_lvl0,
p.callidentifier, p.code p.funding_lvl1,
FROM ${stats_db_name}.project_tmp p p.funding_lvl2,
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np p.ec39,
FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id p.type,
WHERE r.type='publication' p.startdate,
GROUP BY pr.id) AS prr1 on prr1.id = p.id p.enddate,
LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) AS daysForlastPub , count(distinct r.id) AS dp p.start_year,
FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r p.end_year,
WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 p.duration,
GROUP BY pp.id) AS prr2 CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
ON prr2.id = p.id; CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
-- Publication temporary table updates CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); p.callidentifier,
p.code
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
WHERE r.type = 'publication'
GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id,
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
count(distinct r.id) AS dp
FROM ${stats_db_name}.project_tmp pp,
${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pp.id = pr.id
AND pr.result = r.id
AND r.type = 'publication'
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
ON prr2.id = p.id;
-- Dataset temporary table updates UPDATE ${stats_db_name}.publication_tmp
UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); SET delayed = 'yes'
WHERE publication_tmp.id IN (SELECT distinct r.id
FROM stats_wf_db_obs.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
-- Software temporary table updates UPDATE ${stats_db_name}.dataset_tmp
UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); SET delayed = 'yes'
WHERE dataset_tmp.id IN (SELECT distinct r.id
FROM stats_wf_db_obs.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
-- Otherresearchproduct temporary table updates UPDATE ${stats_db_name}.software_tmp
UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); SET delayed = 'yes'
WHERE software_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project; UPDATE ${stats_db_name}.otherresearchproduct_tmp
SET delayed = 'yes'
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
-- Joins each result_projects row to its result row (on id) and its project row
-- (on result_projects.project), keeping only results whose type is 'publication'.
-- Output columns:
--   result          = result_projects.id
--   project_results = result_projects.project
--   resultdate      = result.date
--   projectenddate  = project.enddate
--   daysfromend     = result_projects.daysfromend
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
SELECT rp.id          AS result,
       rp.project     AS project_results,
       r.date         AS resultdate,
       p.enddate      AS projectenddate,
       rp.daysfromend AS daysfromend
FROM ${stats_db_name}.result_projects rp
         INNER JOIN ${stats_db_name}.result r ON rp.id = r.id
         INNER JOIN ${stats_db_name}.project p ON p.id = rp.project
WHERE r.type = 'publication';
ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS;

View File

@ -1,21 +1,25 @@
------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
SELECT *
FROM ${stats_db_name}.datasource_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.datasource; CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp; SELECT *
FROM ${stats_db_name}.publication_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.publication; CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp; SELECT *
FROM ${stats_db_name}.dataset_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.dataset; CREATE TABLE ${stats_db_name}.software stored AS parquet AS
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp; SELECT *
FROM ${stats_db_name}.software_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.software; CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp; SELECT *
FROM ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp; DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp; DROP TABLE ${stats_db_name}.datasource_tmp;
@ -29,13 +33,37 @@ DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
--------------------------------------------- ---------------------------------------------
-- Result -- Result
CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct; CREATE OR REPLACE VIEW ${stats_db_name}.result AS
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.publication
UNION ALL
SELECT *, bestlicence as access_mode
FROM ${stats_db_name}.software
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct;
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
-- To see with Antonis if the following is needed and where it should be placed -- To see with Antonis if the following is needed and where it should be placed
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country; CREATE TABLE ${stats_db_name}.numbers_country AS
SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications
FROM ${stats_db_name}.result r,
${stats_db_name}.result_datasources rd,
${stats_db_name}.datasource d,
${stats_db_name}.datasource_organizations dor,
${stats_db_name}.organization org
WHERE r.id = rd.id
AND rd.datasource = d.id
AND d.id = dor.id
AND dor.organization = org.id
AND r.type = 'publication'
AND r.bestlicence = 'Open Access'
GROUP BY org.country;
ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,41 +5,93 @@
-------------------------------------------------------------- --------------------------------------------------------------
-- Publication temporary table -- Publication temporary table
CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); CREATE TABLE ${stats_db_name}.publication_tmp
(
id STRING,
title STRING,
publisher STRING,
journal STRING,
date STRING,
year STRING,
bestlicence STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING
)
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal , INSERT INTO ${stats_db_name}.publication_tmp
p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence, SELECT substr(p.id, 4) as id,
p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source, p.title[0].value as title,
case when size(p.description) > 0 then true else false end as abstract, p.publisher.value as publisher,
'publication' as type p.journal.name as journal,
p.dateofacceptance.value as date,
date_format(p.dateofacceptance.value, 'yyyy') as year,
p.bestaccessright.classname as bestlicence,
p.embargoenddate.value as embargo_end_date,
false as delayed,
size(p.author) as authors,
concat_ws('\u003B', p.source.value) as source,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference=false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_classifications AS
SELECT substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_concepts AS
SELECT substr(p.id, 4) as id, contexts.context.id as concept
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_datasources as CREATE TABLE ${stats_db_name}.publication_datasources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference=false ) p where p.datainfo.deletedbyinference = false) p
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false ) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_languages AS
select substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.publication_topics as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
-- Publication_citations CREATE TABLE ${stats_db_name}.publication_citations AS
CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and p.datainfo.deletedbyinference=false; SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result
FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false;
ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,34 +5,94 @@
------------------------------------------------------ ------------------------------------------------------
-- Dataset temporary table supporting updates -- Dataset temporary table supporting updates
CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true'); CREATE TABLE ${stats_db_name}.dataset_tmp
(
id STRING,
title STRING,
publisher STRING,
journal STRING,
date STRING,
year STRING,
bestlicence STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING
)
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal, INSERT INTO ${stats_db_name}.dataset_tmp
d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') AS year, d.bestaccessright.classname AS bestlicence, SELECT substr(d.id, 4) AS id,
d.embargoenddate.value AS embargo_end_date, false AS delayed, size(d.author) AS authors , concat_ws('\u003B',d.source.value) AS source, d.title[0].value AS title,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, d.publisher.value AS publisher,
'dataset' AS type cast(null AS string) AS journal,
d.dateofacceptance.value as date,
date_format(d.dateofacceptance.value, 'yyyy') AS year,
d.bestaccessright.classname AS bestlicence,
d.embargoenddate.value AS embargo_end_date,
false AS delayed,
size(d.author) AS authors,
concat_ws('\u003B', d.source.value) AS source,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference=FALSE; WHERE d.datainfo.deletedbyinference = FALSE;
-- Dataset_citations CREATE TABLE ${stats_db_name}.dataset_citations AS
CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and d.datainfo.deletedbyinference=false; SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result
FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_concepts AS
SELECT substr(p.id, 4) as id, contexts.context.id as concept
from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource CREATE TABLE ${stats_db_name}.dataset_datasources AS
FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance) instances AS instance
where p.datainfo.deletedbyinference = false) p
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_languages AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,34 +5,94 @@
-------------------------------------------------------- --------------------------------------------------------
-- Software temporary table supporting updates -- Software temporary table supporting updates
CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); CREATE TABLE ${stats_db_name}.software_tmp
(
id STRING,
title STRING,
publisher STRING,
journal STRING,
date STRING,
year STRING,
bestlicence STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING
)
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal, INSERT INTO ${stats_db_name}.software_tmp
s.dateofacceptance.value AS DATE, date_format(s.dateofacceptance.value,'yyyy') AS YEAR, s.bestaccessright.classname AS bestlicence, SELECT substr(s.id, 4) as id,
s.embargoenddate.value AS embargo_end_date, FALSE AS delayed, SIZE(s.author) AS authors , concat_ws('\u003B',s.source.value) AS source, s.title[0].value AS title,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, s.publisher.value AS publisher,
'software' as type CAST(NULL AS string) AS journal,
s.dateofacceptance.value AS DATE,
date_format(s.dateofacceptance.value, 'yyyy') AS YEAR,
s.bestaccessright.classname AS bestlicence,
s.embargoenddate.value AS embargo_end_date,
FALSE AS delayed,
SIZE(s.author) AS authors,
concat_ws('\u003B', s.source.value) AS source,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference=false; where s.datainfo.deletedbyinference = false;
-- Software_citations CREATE TABLE ${stats_db_name}.software_citations AS
CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and s.datainfo.deletedbyinference=false; SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT
FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_concepts AS
SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource CREATE TABLE ${stats_db_name}.software_datasources AS
FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; FROM (
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance) instances AS instance
where p.datainfo.deletedbyinference = false) p
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_languages AS
select substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,32 +5,85 @@
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates -- Otherresearchproduct temporary table supporting updates
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
(
id STRING,
title STRING,
publisher STRING,
journal STRING,
date STRING,
year STRING,
bestlicence STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal, INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
o.dateofacceptance.value AS DATE, date_format(o.dateofacceptance.value,'yyyy') AS year, o.bestaccessright.classname AS bestlicence, SELECT substr(o.id, 4) AS id,
o.embargoenddate.value as embargo_end_date, FALSE AS delayed, SIZE(o.author) AS authors , concat_ws('\u003B',o.source.value) AS source, o.title[0].value AS title,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, o.publisher.value AS publisher,
'other' AS type CAST(NULL AS string) AS journal,
o.dateofacceptance.value AS DATE,
date_format(o.dateofacceptance.value, 'yyyy') AS year,
o.bestaccessright.classname AS bestlicence,
o.embargoenddate.value as embargo_end_date,
FALSE AS delayed,
SIZE(o.author) AS authors,
concat_ws('\u003B', o.source.value) AS source,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference=FALSE; WHERE o.datainfo.deletedbyinference = FALSE;
-- Otherresearchproduct_citations -- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and o.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference = false) p
LEFT OUTER JOIN(SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS;

View File

@ -1,28 +1,75 @@
-- noinspection SqlNoDataSourceInspectionForFile
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Project table/view and Project related tables/views -- Project table/view and Project related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Project_oids Table CREATE TABLE ${stats_db_name}.project_oids AS
CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids;
CREATE TABLE ${stats_db_name}.project_organizations AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization'
and r.datainfo.deletedbyinference = false;
-- Project_organizations Table CREATE TABLE ${stats_db_name}.project_results AS
CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization' and r.datainfo.deletedbyinference=false; SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject'
and r.datainfo.deletedbyinference = false;
-- Project_results Table CREATE TABLE ${stats_db_name}.project_tmp
CREATE TABLE ${stats_db_name}.project_results AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result FROM ${openaire_db_name}.relation r WHERE r.reltype='resultProject' and r.datainfo.deletedbyinference=false; (
id STRING,
acronym STRING,
title STRING,
funder STRING,
funding_lvl0 STRING,
funding_lvl1 STRING,
funding_lvl2 STRING,
ec39 STRING,
type STRING,
startdate STRING,
enddate STRING,
start_year INT,
end_year INT,
duration INT,
haspubs STRING,
numpubs INT,
daysforlastpub INT,
delayedpubs INT,
callidentifier STRING,
code STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
-- Project table INSERT INTO ${stats_db_name}.project_tmp
---------------- SELECT substr(p.id, 4) AS id,
-- Creating and populating temporary Project table p.acronym.value AS acronym,
CREATE TABLE ${stats_db_name}.project_tmp (id STRING, acronym STRING, title STRING, funder STRING, funding_lvl0 STRING, funding_lvl1 STRING, funding_lvl2 STRING, ec39 STRING, type STRING, startdate STRING, enddate STRING, start_year INT, end_year INT, duration INT, haspubs STRING, numpubs INT, daysforlastpub INT, delayedpubs INT, callidentifier STRING, code STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); p.title.value AS title,
xpath_string(p.fundingtree[0].value, '//funder/name') AS funder,
INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, p.acronym.value AS acronym, p.title.value AS title, xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, p.ecsc39.value AS ec39, p.contracttype.classname AS type, p.startdate.value AS startdate, p.enddate.value AS enddate, year(p.startdate.value) AS start_year, year(p.enddate.value) AS end_year, CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, 'no' AS haspubs, 0 AS numpubs, 0 AS daysforlastpub, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference=false; xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0,
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1,
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2,
p.ecsc39.value AS ec39,
p.contracttype.classname AS type,
p.startdate.value AS startdate,
p.enddate.value AS enddate,
year(p.startdate.value) AS start_year,
year(p.enddate.value) AS end_year,
CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration,
'no' AS haspubs,
0 AS numpubs,
0 AS daysforlastpub,
0 AS delayedpubs,
p.callidentifier.value AS callidentifier,
p.code.value AS code
FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false;
create table ${stats_db_name}.funder as create table ${stats_db_name}.funder as
select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund
ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS;

View File

@ -5,28 +5,135 @@
---------------------------------------------------- ----------------------------------------------------
-- Views on temporary tables that should be re-created in the end -- Views on temporary tables that should be re-created in the end
CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.software_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.dataset_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct_tmp; CREATE OR REPLACE VIEW ${stats_db_name}.result as
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.publication_tmp
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.software_tmp
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset_tmp
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct_tmp;
-- Views on final tables -- Views on final tables
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources UNION ALL SELECT * FROM ${stats_db_name}.software_datasources UNION ALL SELECT * FROM ${stats_db_name}.dataset_datasources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
SELECT *
FROM ${stats_db_name}.publication_datasources
UNION ALL
SELECT *
FROM ${stats_db_name}.software_datasources
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_datasources
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_datasources;
CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS SELECT * FROM ${stats_db_name}.publication_citations UNION ALL SELECT * FROM ${stats_db_name}.software_citations UNION ALL SELECT * FROM ${stats_db_name}.dataset_citations UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS
SELECT *
FROM ${stats_db_name}.publication_citations
UNION ALL
SELECT *
FROM ${stats_db_name}.software_citations
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_citations
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_citations;
CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications UNION ALL SELECT * FROM ${stats_db_name}.software_classifications UNION ALL SELECT * FROM ${stats_db_name}.dataset_classifications UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS
SELECT *
FROM ${stats_db_name}.publication_classifications
UNION ALL
SELECT *
FROM ${stats_db_name}.software_classifications
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_classifications
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_classifications;
CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts UNION ALL SELECT * FROM ${stats_db_name}.software_concepts UNION ALL SELECT * FROM ${stats_db_name}.dataset_concepts UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS
SELECT *
FROM ${stats_db_name}.publication_concepts
UNION ALL
SELECT *
FROM ${stats_db_name}.software_concepts
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_concepts
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_concepts;
CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS SELECT * FROM ${stats_db_name}.publication_languages UNION ALL SELECT * FROM ${stats_db_name}.software_languages UNION ALL SELECT * FROM ${stats_db_name}.dataset_languages UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS
SELECT *
FROM ${stats_db_name}.publication_languages
UNION ALL
SELECT *
FROM ${stats_db_name}.software_languages
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_languages
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_languages;
CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS SELECT * FROM ${stats_db_name}.publication_oids UNION ALL SELECT * FROM ${stats_db_name}.software_oids UNION ALL SELECT * FROM ${stats_db_name}.dataset_oids UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS
SELECT *
FROM ${stats_db_name}.publication_oids
UNION ALL
SELECT *
FROM ${stats_db_name}.software_oids
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_oids
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_oids;
CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * FROM ${stats_db_name}.publication_pids UNION ALL SELECT * FROM ${stats_db_name}.software_pids UNION ALL SELECT * FROM ${stats_db_name}.dataset_pids UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS
SELECT *
FROM ${stats_db_name}.publication_pids
UNION ALL
SELECT *
FROM ${stats_db_name}.software_pids
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_pids
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_pids;
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * FROM ${stats_db_name}.publication_topics UNION ALL SELECT * FROM ${stats_db_name}.software_topics UNION ALL SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS
SELECT *
FROM ${stats_db_name}.publication_topics
UNION ALL
SELECT *
FROM ${stats_db_name}.software_topics
UNION ALL
SELECT *
FROM ${stats_db_name}.dataset_topics
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics;
CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization' and r.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.result_organization AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization'
and r.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.result_projects AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id=pr.result JOIN ${stats_db_name}.project_tmp p ON p.id=pr.id; CREATE TABLE ${stats_db_name}.result_projects AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend
FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;
ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS; ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS; ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS;

View File

@ -5,55 +5,99 @@
-- Datasource table/view and Datasource related tables/views -- Datasource table/view and Datasource related tables/views
------------------------------------------------------------ ------------------------------------------------------------
------------------------------------------------------------ ------------------------------------------------------------
CREATE TABLE ${stats_db_name}.datasource_tmp
-- Datasource table creation & update (
------------------------------------- `id` string,
-- Creating and populating temporary datasource table `name` STRING,
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp; `type` STRING,
CREATE TABLE ${stats_db_name}.datasource_tmp(`id` string, `name` STRING, `type` STRING, `dateofvalidation` STRING, `yearofvalidation` string, `harvested` BOOLEAN, `piwik_id` INT, `latitude` STRING, `longitude`STRING, `websiteurl` STRING, `compatibility` STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); `dateofvalidation` STRING,
`yearofvalidation` string,
`harvested` BOOLEAN,
`piwik_id` INT,
`latitude` STRING,
`longitude` STRING,
`websiteurl` STRING,
`compatibility` STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
-- Insert statement that takes into account the piwik_id of the openAIRE graph -- Insert statement that takes into account the piwik_id of the openAIRE graph
INSERT INTO ${stats_db_name}.datasource_tmp INSERT INTO ${stats_db_name}.datasource_tmp
SELECT substr(d1.id, 4) AS id, officialname.value AS name, SELECT substr(d1.id, 4) AS id,
datasourcetype.classname AS type, dateofvalidation.value AS dateofvalidation, date_format(d1.dateofvalidation.value,'yyyy') AS yearofvalidation, officialname.value AS name,
FALSE AS harvested, datasourcetype.classname AS type,
CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id, dateofvalidation.value AS dateofvalidation,
d1.latitude.value AS latitude, d1.longitude.value AS longitude, date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation,
d1.websiteurl.value AS websiteurl, d1.openairecompatibility.classid AS compatibility FALSE AS harvested,
CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id,
d1.latitude.value AS latitude,
d1.longitude.value AS longitude,
d1.websiteurl.value AS websiteurl,
d1.openairecompatibility.classid AS compatibility
FROM ${openaire_db_name}.datasource d1 FROM ${openaire_db_name}.datasource d1
LEFT OUTER JOIN LEFT OUTER JOIN
(SELECT id, split(originalidd, '\\:')[1] as piwik_id (SELECT id, split(originalidd, '\\:')[1] as piwik_id
FROM ${openaire_db_name}.datasource FROM ${openaire_db_name}.datasource
LATERAL VIEW EXPLODE(originalid) temp AS originalidd LATERAL VIEW EXPLODE(originalid) temp AS originalidd
WHERE originalidd like "piwik:%") AS d2 WHERE originalidd like "piwik:%") AS d2
ON d1.id = d2.id ON d1.id = d2.id
WHERE d1.datainfo.deletedbyinference=FALSE; WHERE d1.datainfo.deletedbyinference = FALSE;
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary dual table that will be removed after the following insert -- Creating a temporary dual table that will be removed after the following insert
CREATE TABLE ${stats_db_name}.dual(dummy CHAR(1)); CREATE TABLE ${stats_db_name}.dual
INSERT INTO ${stats_db_name}.dual VALUES('X'); (
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) dummy CHAR(1)
SELECT 'other', 'Other', 'Repository', NULL, NULL, false, 0, NULL, NULL, NULL, 'unknown' FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name='Unknown Repository'); );
INSERT INTO ${stats_db_name}.dual
VALUES ('X');
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`)
SELECT 'other',
'Other',
'Repository',
NULL,
NULL,
false,
0,
NULL,
NULL,
NULL,
'unknown'
FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
DROP TABLE ${stats_db_name}.dual; DROP TABLE ${stats_db_name}.dual;
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name='Unknown Repository'; UPDATE ${stats_db_name}.datasource_tmp
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation='-1'; SET name='Other'
WHERE name = 'Unknown Repository';
UPDATE ${stats_db_name}.datasource_tmp
SET yearofvalidation=null
WHERE yearofvalidation = '-1';
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages; CREATE TABLE ${stats_db_name}.datasource_languages AS
CREATE TABLE ${stats_db_name}.datasource_languages AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages; SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids; CREATE TABLE ${stats_db_name}.datasource_oids AS
CREATE TABLE ${stats_db_name}.datasource_oids AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations; CREATE TABLE ${stats_db_name}.datasource_organizations AS
CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization' and r.datainfo.deletedbyinference=false; SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization'
and r.datainfo.deletedbyinference = false;
-- datasource sources:
-- where the datasource info has been collected from.
-- One row per element of d.collectedfrom, for datasources that are not flagged as
-- deleted by inference; both the datasource id and the collectedfrom key are taken
-- from their 4th character onwards.
-- The table is dropped first, like the other datasource_* tables in this script, so
-- that a re-run refreshes the data instead of silently keeping a stale table.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.datasource_sources AS
SELECT substr(d.id, 4)   AS id,
       substr(cf.key, 4) AS datasource
FROM ${openaire_db_name}.datasource d
         LATERAL VIEW explode(d.collectedfrom) cfrom AS cf
WHERE d.datainfo.deletedbyinference = FALSE;
-- datasource results:
-- the result_datasources table seen from the datasource side: its datasource column
-- is exposed as id and its id column is exposed as result.
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id,
       id         AS result
FROM ${stats_db_name}.result_datasources;
-- Gather table-level and then column-level statistics for datasource_tmp
-- (each statement runs once).
ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS;

View File

@ -3,13 +3,21 @@
-- Organization table/view and Organization related tables/views
----------------------------------------------------------------
----------------------------------------------------------------
-- organization:
-- one row per organization that is not flagged as deleted by inference, carrying the
-- id (from its 4th character onwards), legal name, legal short name and country class id.
-- The table is dropped first: CREATE TABLE IF NOT EXISTS alone would keep a stale
-- table from a previous run untouched.
DROP TABLE IF EXISTS ${stats_db_name}.organization;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS
SELECT substr(o.id, 4)        AS id,
       o.legalname.value      AS name,
       o.legalshortname.value AS legalshortname,
       o.country.classid      AS country
FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE;
-- organization datasources:
-- the datasource_organizations table seen from the organization side: its organization
-- column is exposed as id and its id column is exposed as datasource.
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT organization AS id,
       id           AS datasource
FROM ${stats_db_name}.datasource_organizations;
-- organization projects:
-- the project_organizations table seen from the organization side: its id column is
-- exposed as project and its organization column is exposed as id.
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS
SELECT id           AS project,
       organization AS id
FROM ${stats_db_name}.project_organizations;
-- Gather table-level and then column-level statistics for organization
-- (each statement runs once).
ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS;
ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS FOR COLUMNS;