forked from D-Net/dnet-hadoop
Improve performance and efficiency by rewriting the creation process of "publication", "project", "dataset", "datasource", "software", "otherresearchproduct" and "result" tables, to be performed in a single query, for each one.
This commit is contained in:
parent
fe2275a9b0
commit
54e11b6a43
|
@ -4,108 +4,6 @@
|
||||||
----------------------------------------------------------------
|
----------------------------------------------------------------
|
||||||
----------------------------------------------------------------
|
----------------------------------------------------------------
|
||||||
|
|
||||||
--Datasource temporary table updates
|
|
||||||
UPDATE ${stats_db_name}.datasource_tmp
|
|
||||||
SET harvested='true'
|
|
||||||
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
|
|
||||||
FROM ${stats_db_name}.datasource_tmp d,
|
|
||||||
${stats_db_name}.result_datasources rd
|
|
||||||
WHERE d.id = rd.datasource); -- /*EOS*/
|
|
||||||
|
|
||||||
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
|
|
||||||
UPDATE ${stats_db_name}.project_tmp
|
|
||||||
SET haspubs='yes'
|
|
||||||
WHERE project_tmp.id IN (SELECT pr.id
|
|
||||||
FROM ${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.result r
|
|
||||||
WHERE pr.result = r.id
|
|
||||||
AND r.type = 'publication'); -- /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.project purge; -- /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.project stored as parquet as
|
|
||||||
SELECT p.id,
|
|
||||||
p.acronym,
|
|
||||||
p.title,
|
|
||||||
p.funder,
|
|
||||||
p.funding_lvl0,
|
|
||||||
p.funding_lvl1,
|
|
||||||
p.funding_lvl2,
|
|
||||||
p.ec39,
|
|
||||||
p.type,
|
|
||||||
p.startdate,
|
|
||||||
p.enddate,
|
|
||||||
p.start_year,
|
|
||||||
p.end_year,
|
|
||||||
p.duration,
|
|
||||||
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
|
|
||||||
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
|
|
||||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
|
|
||||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
|
|
||||||
p.callidentifier,
|
|
||||||
p.code,
|
|
||||||
p.totalcost,
|
|
||||||
p.fundedamount,
|
|
||||||
p.currency
|
|
||||||
FROM ${stats_db_name}.project_tmp p
|
|
||||||
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
|
|
||||||
FROM ${stats_db_name}.project_results pr
|
|
||||||
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
|
|
||||||
WHERE r.type = 'publication'
|
|
||||||
GROUP BY pr.id) AS prr1 on prr1.id = p.id
|
|
||||||
LEFT JOIN (SELECT pp.id,
|
|
||||||
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
|
|
||||||
count(distinct r.id) AS dp
|
|
||||||
FROM ${stats_db_name}.project_tmp pp,
|
|
||||||
${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.result r
|
|
||||||
WHERE pp.id = pr.id
|
|
||||||
AND pr.result = r.id
|
|
||||||
AND r.type = 'publication'
|
|
||||||
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
|
|
||||||
GROUP BY pp.id) AS prr2
|
|
||||||
ON prr2.id = p.id; -- /*EOS*/
|
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.publication_tmp
|
|
||||||
SET delayed = 'yes'
|
|
||||||
WHERE publication_tmp.id IN (SELECT distinct r.id
|
|
||||||
FROM ${stats_db_name}.result r,
|
|
||||||
${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.project_tmp p
|
|
||||||
WHERE r.id = pr.result
|
|
||||||
AND pr.id = p.id
|
|
||||||
AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/
|
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.dataset_tmp
|
|
||||||
SET delayed = 'yes'
|
|
||||||
WHERE dataset_tmp.id IN (SELECT distinct r.id
|
|
||||||
FROM ${stats_db_name}.result r,
|
|
||||||
${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.project_tmp p
|
|
||||||
WHERE r.id = pr.result
|
|
||||||
AND pr.id = p.id
|
|
||||||
AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/
|
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.software_tmp
|
|
||||||
SET delayed = 'yes'
|
|
||||||
WHERE software_tmp.id IN (SELECT distinct r.id
|
|
||||||
FROM ${stats_db_name}.result r,
|
|
||||||
${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.project_tmp p
|
|
||||||
WHERE r.id = pr.result
|
|
||||||
AND pr.id = p.id
|
|
||||||
AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/
|
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.otherresearchproduct_tmp
|
|
||||||
SET delayed = 'yes'
|
|
||||||
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
|
|
||||||
FROM ${stats_db_name}.result r,
|
|
||||||
${stats_db_name}.project_results pr,
|
|
||||||
${stats_db_name}.project_tmp p
|
|
||||||
WHERE r.id = pr.result
|
|
||||||
AND pr.id = p.id
|
|
||||||
AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/
|
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
|
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
|
||||||
SELECT result_projects.id AS result,
|
SELECT result_projects.id AS result,
|
||||||
result_projects.project AS project_results,
|
result_projects.project AS project_results,
|
||||||
|
|
|
@ -1,42 +1,4 @@
|
||||||
------------------------------------------------------------------------------------------------------
|
set mapred.job.queue.name=analytics; /*EOS*/
|
||||||
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
|
|
||||||
------------------------------------------------------------------------------------------------------
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
|
|
||||||
SELECT *
|
|
||||||
FROM ${stats_db_name}.datasource_tmp; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
|
|
||||||
SELECT *
|
|
||||||
FROM ${stats_db_name}.publication_tmp; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
|
|
||||||
SELECT *
|
|
||||||
FROM ${stats_db_name}.dataset_tmp; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
|
|
||||||
SELECT *
|
|
||||||
FROM ${stats_db_name}.software_tmp; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
|
|
||||||
SELECT *
|
|
||||||
FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
|
|
||||||
|
|
||||||
----------------------------------------------
|
----------------------------------------------
|
||||||
-- Re-creating views from final parquet tables
|
-- Re-creating views from final parquet tables
|
||||||
|
|
|
@ -1,58 +1,26 @@
|
||||||
set mapred.job.queue.name=analytics; /*EOS*/
|
set mapred.job.queue.name=analytics; /*EOS*/
|
||||||
|
|
||||||
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
|
||||||
-- peer reviewed)
|
-- peer reviewed)
|
||||||
drop table if exists ${stats_db_name}.result_tmp; /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.result_tmp (
|
|
||||||
id STRING,
|
|
||||||
title STRING,
|
|
||||||
publisher STRING,
|
|
||||||
journal STRING,
|
|
||||||
`date` STRING,
|
|
||||||
`year` INT,
|
|
||||||
bestlicence STRING,
|
|
||||||
access_mode STRING,
|
|
||||||
embargo_end_date STRING,
|
|
||||||
delayed BOOLEAN,
|
|
||||||
authors INT,
|
|
||||||
source STRING,
|
|
||||||
abstract BOOLEAN,
|
|
||||||
type STRING ,
|
|
||||||
peer_reviewed BOOLEAN,
|
|
||||||
green BOOLEAN,
|
|
||||||
gold BOOLEAN)
|
|
||||||
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/
|
|
||||||
|
|
||||||
insert into ${stats_db_name}.result_tmp
|
|
||||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
|
||||||
FROM ${stats_db_name}.publication r
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
|
||||||
|
|
||||||
insert into ${stats_db_name}.result_tmp
|
|
||||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
|
||||||
FROM ${stats_db_name}.dataset r
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
|
||||||
|
|
||||||
insert into ${stats_db_name}.result_tmp
|
|
||||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
|
||||||
FROM ${stats_db_name}.software r
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
|
||||||
|
|
||||||
insert into ${stats_db_name}.result_tmp
|
|
||||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
|
||||||
FROM ${stats_db_name}.otherresearchproduct r
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
|
||||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.result; /*EOS*/
|
|
||||||
drop view if exists ${stats_db_name}.result; /*EOS*/
|
drop view if exists ${stats_db_name}.result; /*EOS*/
|
||||||
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/
|
drop table if exists ${stats_db_name}.result; /*EOS*/
|
||||||
drop table ${stats_db_name}.result_tmp; /*EOS*/
|
|
||||||
|
CREATE TABLE ${stats_db_name}.result stored as parquet as
|
||||||
|
SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||||
|
FROM (
|
||||||
|
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||||
|
FROM ${stats_db_name}.publication)
|
||||||
|
UNION ALL
|
||||||
|
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||||
|
FROM ${stats_db_name}.dataset)
|
||||||
|
UNION ALL
|
||||||
|
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||||
|
FROM ${stats_db_name}.software)
|
||||||
|
UNION ALL
|
||||||
|
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||||
|
FROM ${stats_db_name}.otherresearchproduct)
|
||||||
|
) r
|
||||||
|
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||||
|
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||||
|
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
||||||
|
|
|
@ -7,41 +7,41 @@ set mapred.job.queue.name=analytics; /*EOS*/
|
||||||
--------------------------------------------------------------
|
--------------------------------------------------------------
|
||||||
|
|
||||||
-- Publication temporary table
|
-- Publication temporary table
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
|
||||||
CREATE TABLE ${stats_db_name}.publication_tmp
|
|
||||||
(
|
|
||||||
id STRING,
|
|
||||||
title STRING,
|
|
||||||
publisher STRING,
|
|
||||||
journal STRING,
|
|
||||||
date STRING,
|
|
||||||
year STRING,
|
|
||||||
bestlicence STRING,
|
|
||||||
embargo_end_date STRING,
|
|
||||||
delayed BOOLEAN,
|
|
||||||
authors INT,
|
|
||||||
source STRING,
|
|
||||||
abstract BOOLEAN,
|
|
||||||
type STRING
|
|
||||||
)
|
|
||||||
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/
|
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.publication_tmp
|
CREATE TABLE ${stats_db_name}.publication stored as parquet as
|
||||||
SELECT substr(p.id, 4) as id,
|
with pub_pr as (
|
||||||
p.title[0].value as title,
|
select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||||
p.publisher.value as publisher,
|
from ${openaire_db_name}.publication pub
|
||||||
p.journal.name as journal,
|
join ${openaire_db_name}.relation rel
|
||||||
p.dateofacceptance.value as date,
|
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
|
||||||
date_format(p.dateofacceptance.value, 'yyyy') as year,
|
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||||
p.bestaccessright.classname as bestlicence,
|
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||||
p.embargoenddate.value as embargo_end_date,
|
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
|
||||||
false as delayed,
|
),
|
||||||
size(p.author) as authors,
|
pub_delayed as (
|
||||||
concat_ws('\u003B', p.source.value) as source,
|
select pub_id, max(delayed) as delayed
|
||||||
case when size(p.description) > 0 then true else false end as abstract,
|
from pub_pr
|
||||||
|
group by pub_id
|
||||||
|
)
|
||||||
|
select /*+ COALESCE(100) */
|
||||||
|
substr(pub.id, 4) as id,
|
||||||
|
pub.title[0].value as title,
|
||||||
|
pub.publisher.value as publisher,
|
||||||
|
pub.journal.name as journal,
|
||||||
|
pub.dateofacceptance.value as date,
|
||||||
|
date_format(pub.dateofacceptance.value, 'yyyy') as year,
|
||||||
|
pub.bestaccessright.classname as bestlicence,
|
||||||
|
pub.embargoenddate.value as embargo_end_date,
|
||||||
|
coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
|
||||||
|
size(pub.author) as authors,
|
||||||
|
concat_ws('\u003B', pub.source.value) as source,
|
||||||
|
case when size(pub.description) > 0 then true else false end as abstract,
|
||||||
'publication' as type
|
'publication' as type
|
||||||
from ${openaire_db_name}.publication p
|
from ${openaire_db_name}.publication pub
|
||||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
left outer join pub_delayed on pub.id=pub_delayed.pub_id
|
||||||
|
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
|
||||||
|
|
||||||
|
|
|
@ -5,42 +5,41 @@
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
|
|
||||||
-- Dataset temporary table supporting updates
|
-- Dataset temporary table supporting updates
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.dataset_tmp
|
CREATE TABLE ${stats_db_name}.dataset stored as parquet as
|
||||||
(
|
with datast_pr as (
|
||||||
id STRING,
|
select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||||
title STRING,
|
from ${openaire_db_name}.dataset datast
|
||||||
publisher STRING,
|
join ${openaire_db_name}.relation rel
|
||||||
journal STRING,
|
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id
|
||||||
date STRING,
|
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||||
year STRING,
|
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||||
bestlicence STRING,
|
where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false
|
||||||
embargo_end_date STRING,
|
),
|
||||||
delayed BOOLEAN,
|
datast_delayed as (
|
||||||
authors INT,
|
select datast_id, max(delayed) as delayed
|
||||||
source STRING,
|
from datast_pr
|
||||||
abstract BOOLEAN,
|
group by datast_id
|
||||||
type STRING
|
|
||||||
)
|
)
|
||||||
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
|
select /*+ COALESCE(100) */
|
||||||
|
substr(datast.id, 4) as id,
|
||||||
|
datast.title[0].value as title,
|
||||||
|
datast.publisher.value as publisher,
|
||||||
|
cast(null as string) as journal,
|
||||||
|
datast.dateofacceptance.value as date,
|
||||||
|
date_format(datast.dateofacceptance.value, 'yyyy') as year,
|
||||||
|
datast.bestaccessright.classname as bestlicence,
|
||||||
|
datast.embargoenddate.value as embargo_end_date,
|
||||||
|
coalesce(datast_delayed.delayed, false) as delayed, -- It's delayed, when the dataset was published after the end of the project.
|
||||||
|
size(datast.author) as authors,
|
||||||
|
concat_ws('\u003B', datast.source.value) as source,
|
||||||
|
case when size(datast.description) > 0 then true else false end as abstract,
|
||||||
|
'dataset' as type
|
||||||
|
from ${openaire_db_name}.dataset datast
|
||||||
|
left outer join datast_delayed on datast.id=datast_delayed.datast_id
|
||||||
|
where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.dataset_tmp
|
|
||||||
SELECT substr(d.id, 4) AS id,
|
|
||||||
d.title[0].value AS title,
|
|
||||||
d.publisher.value AS publisher,
|
|
||||||
cast(null AS string) AS journal,
|
|
||||||
d.dateofacceptance.value as date,
|
|
||||||
date_format(d.dateofacceptance.value, 'yyyy') AS year,
|
|
||||||
d.bestaccessright.classname AS bestlicence,
|
|
||||||
d.embargoenddate.value AS embargo_end_date,
|
|
||||||
false AS delayed,
|
|
||||||
size(d.author) AS authors,
|
|
||||||
concat_ws('\u003B', d.source.value) AS source,
|
|
||||||
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
|
|
||||||
'dataset' AS type
|
|
||||||
FROM ${openaire_db_name}.dataset d
|
|
||||||
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
|
||||||
|
|
||||||
|
|
|
@ -5,41 +5,41 @@
|
||||||
--------------------------------------------------------
|
--------------------------------------------------------
|
||||||
|
|
||||||
-- Software temporary table supporting updates
|
-- Software temporary table supporting updates
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
|
||||||
CREATE TABLE ${stats_db_name}.software_tmp
|
|
||||||
(
|
|
||||||
id STRING,
|
|
||||||
title STRING,
|
|
||||||
publisher STRING,
|
|
||||||
journal STRING,
|
|
||||||
date STRING,
|
|
||||||
year STRING,
|
|
||||||
bestlicence STRING,
|
|
||||||
embargo_end_date STRING,
|
|
||||||
delayed BOOLEAN,
|
|
||||||
authors INT,
|
|
||||||
source STRING,
|
|
||||||
abstract BOOLEAN,
|
|
||||||
type STRING
|
|
||||||
)
|
|
||||||
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
|
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.software_tmp
|
CREATE TABLE ${stats_db_name}.software stored as parquet as
|
||||||
SELECT substr(s.id, 4) as id,
|
with soft_pr as (
|
||||||
s.title[0].value AS title,
|
select soft.id as soft_id, case when (to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||||
s.publisher.value AS publisher,
|
from ${openaire_db_name}.software soft
|
||||||
CAST(NULL AS string) AS journal,
|
join ${openaire_db_name}.relation rel
|
||||||
s.dateofacceptance.value AS DATE,
|
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id
|
||||||
date_format(s.dateofacceptance.value, 'yyyy') AS YEAR,
|
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||||
s.bestaccessright.classname AS bestlicence,
|
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||||
s.embargoenddate.value AS embargo_end_date,
|
where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false
|
||||||
FALSE AS delayed,
|
),
|
||||||
SIZE(s.author) AS authors,
|
soft_delayed as (
|
||||||
concat_ws('\u003B', s.source.value) AS source,
|
select soft_id, max(delayed) as delayed
|
||||||
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
|
from soft_pr
|
||||||
|
group by soft_id
|
||||||
|
)
|
||||||
|
select /*+ COALESCE(100) */
|
||||||
|
substr(soft.id, 4) as id,
|
||||||
|
soft.title[0].value as title,
|
||||||
|
soft.publisher.value as publisher,
|
||||||
|
cast(null as string) as journal,
|
||||||
|
soft.dateofacceptance.value as date,
|
||||||
|
date_format(soft.dateofacceptance.value, 'yyyy') as year,
|
||||||
|
soft.bestaccessright.classname as bestlicence,
|
||||||
|
soft.embargoenddate.value as embargo_end_date,
|
||||||
|
coalesce(soft_delayed.delayed, false) as delayed, -- It's delayed, when the software was published after the end of the project.
|
||||||
|
size(soft.author) as authors,
|
||||||
|
concat_ws('\u003B', soft.source.value) as source,
|
||||||
|
case when size(soft.description) > 0 then true else false end as abstract,
|
||||||
'software' as type
|
'software' as type
|
||||||
from ${openaire_db_name}.software s
|
from ${openaire_db_name}.software soft
|
||||||
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
|
left outer join soft_delayed on soft.id=soft_delayed.soft_id
|
||||||
|
where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
|
||||||
|
|
||||||
|
|
|
@ -5,41 +5,41 @@
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
|
|
||||||
-- Otherresearchproduct temporary table supporting updates
|
-- Otherresearchproduct temporary table supporting updates
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
|
CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as
|
||||||
(
|
with other_pr as (
|
||||||
id STRING,
|
select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||||
title STRING,
|
from ${openaire_db_name}.otherresearchproduct other
|
||||||
publisher STRING,
|
join ${openaire_db_name}.relation rel
|
||||||
journal STRING,
|
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id
|
||||||
date STRING,
|
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||||
year STRING,
|
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||||
bestlicence STRING,
|
where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false
|
||||||
embargo_end_date STRING,
|
),
|
||||||
delayed BOOLEAN,
|
other_delayed as (
|
||||||
authors INT,
|
select other_id, max(delayed) as delayed
|
||||||
source STRING,
|
from other_pr
|
||||||
abstract BOOLEAN,
|
group by other_id
|
||||||
type STRING
|
)
|
||||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
|
select /*+ COALESCE(100) */
|
||||||
|
substr(other.id, 4) as id,
|
||||||
|
other.title[0].value as title,
|
||||||
|
other.publisher.value as publisher,
|
||||||
|
cast(null as string) as journal,
|
||||||
|
other.dateofacceptance.value as date,
|
||||||
|
date_format(other.dateofacceptance.value, 'yyyy') as year,
|
||||||
|
other.bestaccessright.classname as bestlicence,
|
||||||
|
other.embargoenddate.value as embargo_end_date,
|
||||||
|
false as delayed,
|
||||||
|
size(other.author) as authors,
|
||||||
|
concat_ws('\u003B', other.source.value) as source,
|
||||||
|
case when size(other.description) > 0 then true else false end as abstract,
|
||||||
|
'other' as type
|
||||||
|
from ${openaire_db_name}.otherresearchproduct other
|
||||||
|
left outer join other_delayed on other.id=other_delayed.other_id
|
||||||
|
where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
|
|
||||||
SELECT substr(o.id, 4) AS id,
|
|
||||||
o.title[0].value AS title,
|
|
||||||
o.publisher.value AS publisher,
|
|
||||||
CAST(NULL AS string) AS journal,
|
|
||||||
o.dateofacceptance.value AS DATE,
|
|
||||||
date_format(o.dateofacceptance.value, 'yyyy') AS year,
|
|
||||||
o.bestaccessright.classname AS bestlicence,
|
|
||||||
o.embargoenddate.value as embargo_end_date,
|
|
||||||
FALSE AS delayed,
|
|
||||||
SIZE(o.author) AS authors,
|
|
||||||
concat_ws('\u003B', o.source.value) AS source,
|
|
||||||
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
|
|
||||||
'other' AS type
|
|
||||||
FROM ${openaire_db_name}.otherresearchproduct o
|
|
||||||
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/
|
|
||||||
|
|
||||||
-- Otherresearchproduct_citations
|
-- Otherresearchproduct_citations
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
|
||||||
|
|
|
@ -34,61 +34,69 @@ from ${openaire_db_name}.project p
|
||||||
lateral view explode(p.h2020classification) classifs as class
|
lateral view explode(p.h2020classification) classifs as class
|
||||||
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
|
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.project_tmp
|
CREATE TABLE ${stats_db_name}.project stored as parquet as
|
||||||
(
|
with pr_pub as (
|
||||||
id STRING,
|
select pr.id as pr_id, pub.id as pub_id,
|
||||||
acronym STRING,
|
(case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed,
|
||||||
title STRING,
|
max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub
|
||||||
funder STRING,
|
from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication
|
||||||
funding_lvl0 STRING,
|
where datainfo.deletedbyinference = false and datainfo.invisible = false) pub
|
||||||
funding_lvl1 STRING,
|
join ${openaire_db_name}.relation rel
|
||||||
funding_lvl2 STRING,
|
on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id
|
||||||
ec39 STRING,
|
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||||
type STRING,
|
join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project
|
||||||
startdate STRING,
|
where datainfo.deletedbyinference = false and datainfo.invisible = false) pr
|
||||||
enddate STRING,
|
on pr.id=rel.target
|
||||||
start_year INT,
|
group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate
|
||||||
end_year INT,
|
),
|
||||||
duration INT,
|
num_pubs_pr as (
|
||||||
haspubs STRING,
|
select pr_id, count( distinct pub_id) as num_pubs
|
||||||
numpubs INT,
|
from pr_pub
|
||||||
daysforlastpub INT,
|
group by pr_id
|
||||||
delayedpubs INT,
|
),
|
||||||
callidentifier STRING,
|
pub_delayed as (
|
||||||
code STRING,
|
select pr_id, pub_id, max(delayed) as delayed
|
||||||
totalcost FLOAT,
|
from pr_pub
|
||||||
fundedamount FLOAT,
|
group by pr_id, pub_id
|
||||||
currency STRING
|
),
|
||||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
|
num_pub_delayed as (
|
||||||
|
select pr_id, count(distinct pub_id) as num_delayed
|
||||||
|
from pub_delayed
|
||||||
|
where delayed
|
||||||
|
group by pr_id
|
||||||
|
)
|
||||||
|
select /*+ COALESCE(100) */
|
||||||
|
substr(p.id, 4) as id,
|
||||||
|
p.acronym.value as acronym,
|
||||||
|
p.title.value as title,
|
||||||
|
xpath_string(p.fundingtree[0].value, '//funder/name') as funder,
|
||||||
|
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') as funding_lvl0,
|
||||||
|
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') as funding_lvl1,
|
||||||
|
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') as funding_lvl2,
|
||||||
|
p.ecsc39.value as ec39,
|
||||||
|
p.contracttype.classname as type,
|
||||||
|
p.startdate.value as startdate,
|
||||||
|
p.enddate.value as enddate,
|
||||||
|
year(p.startdate.value) as start_year,
|
||||||
|
year(p.enddate.value) as end_year,
|
||||||
|
cast(months_between(p.enddate.value, p.startdate.value) as int) as duration,
|
||||||
|
case when pr_pub.pub_id is null then 'no' else 'yes' end as haspubs,
|
||||||
|
num_pubs_pr.num_pubs as numpubs,
|
||||||
|
pr_pub.daysForlastPub as daysForlastPub,
|
||||||
|
npd.num_delayed as delayedpubs,
|
||||||
|
p.callidentifier.value as callidentifier,
|
||||||
|
p.code.value as code,
|
||||||
|
p.totalcost as totalcost,
|
||||||
|
p.fundedamount as fundedamount,
|
||||||
|
p.currency.value as currency
|
||||||
|
from ${openaire_db_name}.project p
|
||||||
|
left outer join pr_pub on pr_pub.pr_id = p.id
|
||||||
|
left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id
|
||||||
|
left outer join num_pub_delayed npd on npd.pr_id=p.id
|
||||||
|
where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.project_tmp
|
|
||||||
SELECT substr(p.id, 4) AS id,
|
|
||||||
p.acronym.value AS acronym,
|
|
||||||
p.title.value AS title,
|
|
||||||
xpath_string(p.fundingtree[0].value, '//funder/name') AS funder,
|
|
||||||
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0,
|
|
||||||
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1,
|
|
||||||
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2,
|
|
||||||
p.ecsc39.value AS ec39,
|
|
||||||
p.contracttype.classname AS type,
|
|
||||||
p.startdate.value AS startdate,
|
|
||||||
p.enddate.value AS enddate,
|
|
||||||
year(p.startdate.value) AS start_year,
|
|
||||||
year(p.enddate.value) AS end_year,
|
|
||||||
CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration,
|
|
||||||
'no' AS haspubs,
|
|
||||||
0 AS numpubs,
|
|
||||||
0 AS daysforlastpub,
|
|
||||||
0 AS delayedpubs,
|
|
||||||
p.callidentifier.value AS callidentifier,
|
|
||||||
p.code.value AS code,
|
|
||||||
p.totalcost AS totalcost,
|
|
||||||
p.fundedamount AS fundedamount,
|
|
||||||
p.currency.value AS currency
|
|
||||||
FROM ${openaire_db_name}.project p
|
|
||||||
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
|
||||||
|
|
||||||
|
|
|
@ -7,16 +7,16 @@
|
||||||
-- Views on temporary tables that should be re-created in the end
|
-- Views on temporary tables that should be re-created in the end
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.result as
|
CREATE OR REPLACE VIEW ${stats_db_name}.result as
|
||||||
SELECT *, bestlicence AS access_mode
|
SELECT *, bestlicence AS access_mode
|
||||||
FROM ${stats_db_name}.publication_tmp
|
FROM ${stats_db_name}.publication
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT *, bestlicence AS access_mode
|
SELECT *, bestlicence AS access_mode
|
||||||
FROM ${stats_db_name}.software_tmp
|
FROM ${stats_db_name}.software
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT *, bestlicence AS access_mode
|
SELECT *, bestlicence AS access_mode
|
||||||
FROM ${stats_db_name}.dataset_tmp
|
FROM ${stats_db_name}.dataset
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT *, bestlicence AS access_mode
|
SELECT *, bestlicence AS access_mode
|
||||||
FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
|
FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
|
||||||
|
|
||||||
-- Views on final tables
|
-- Views on final tables
|
||||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
|
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
|
||||||
|
@ -153,4 +153,4 @@ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
|
||||||
select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
|
select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
|
||||||
FROM ${stats_db_name}.result r
|
FROM ${stats_db_name}.result r
|
||||||
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
|
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
|
||||||
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/
|
JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/
|
||||||
|
|
|
@ -5,81 +5,36 @@
|
||||||
-- Datasource table/view and Datasource related tables/views
|
-- Datasource table/view and Datasource related tables/views
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; -- /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.datasource_tmp
|
CREATE TABLE ${stats_db_name}.datasource stored as parquet as
|
||||||
(
|
with piwik_datasource as (
|
||||||
`id` string,
|
select id, split(originalidd, '\\:')[1] as piwik_id
|
||||||
`name` STRING,
|
from ${openaire_db_name}.datasource
|
||||||
`type` STRING,
|
lateral view explode(originalid) temp as originalidd
|
||||||
`dateofvalidation` STRING,
|
where originalidd like "piwik:%"
|
||||||
`yearofvalidation` string,
|
)
|
||||||
`harvested` BOOLEAN,
|
select /*+ COALESCE(100) */
|
||||||
`piwik_id` INT,
|
substr(dtrce.id, 4) as id,
|
||||||
`latitude` STRING,
|
case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name,
|
||||||
`longitude` STRING,
|
dtrce.datasourcetype.classname as type,
|
||||||
`websiteurl` STRING,
|
dtrce.dateofvalidation.value as dateofvalidation,
|
||||||
`compatibility` STRING,
|
case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end as yearofvalidation,
|
||||||
issn_printed STRING,
|
case when res.d_id is null then false else true end as harvested,
|
||||||
issn_online STRING
|
case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end as piwik_id,
|
||||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/
|
dtrce.latitude.value as latitude,
|
||||||
|
dtrce.longitude.value as longitude,
|
||||||
|
dtrce.websiteurl.value as websiteurl,
|
||||||
|
dtrce.openairecompatibility.classid as compatibility,
|
||||||
|
dtrce.journal.issnprinted as issn_printed,
|
||||||
|
dtrce.journal.issnonline as issn_online
|
||||||
|
from ${openaire_db_name}.datasource dtrce
|
||||||
|
left outer join (select inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id
|
||||||
|
left outer join piwik_datasource piwik_d on piwik_d.id=dtrce.id
|
||||||
|
where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/
|
||||||
|
|
||||||
-- Insert statement that takes into account the piwik_id of the openAIRE graph
|
|
||||||
INSERT INTO ${stats_db_name}.datasource_tmp
|
|
||||||
SELECT substr(d1.id, 4) AS id,
|
|
||||||
officialname.value AS name,
|
|
||||||
datasourcetype.classname AS type,
|
|
||||||
dateofvalidation.value AS dateofvalidation,
|
|
||||||
date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation,
|
|
||||||
FALSE AS harvested,
|
|
||||||
CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id,
|
|
||||||
d1.latitude.value AS latitude,
|
|
||||||
d1.longitude.value AS longitude,
|
|
||||||
d1.websiteurl.value AS websiteurl,
|
|
||||||
d1.openairecompatibility.classid AS compatibility,
|
|
||||||
d1.journal.issnprinted AS issn_printed,
|
|
||||||
d1.journal.issnonline AS issn_online
|
|
||||||
FROM ${openaire_db_name}.datasource d1
|
|
||||||
LEFT OUTER JOIN
|
|
||||||
(SELECT id, split(originalidd, '\\:')[1] as piwik_id
|
|
||||||
FROM ${openaire_db_name}.datasource
|
|
||||||
LATERAL VIEW EXPLODE(originalid) temp AS originalidd
|
|
||||||
WHERE originalidd like "piwik:%") AS d2
|
|
||||||
ON d1.id = d2.id
|
|
||||||
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- /*EOS*/
|
|
||||||
|
|
||||||
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
|
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
|
||||||
-- Creating a temporary dual table that will be removed after the following insert
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.dual purge; -- /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); -- /*EOS*/
|
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.dual VALUES ('X'); -- /*EOS*/
|
|
||||||
|
|
||||||
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
|
|
||||||
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
|
|
||||||
SELECT 'other',
|
|
||||||
'Other',
|
|
||||||
'Repository',
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
false,
|
|
||||||
0,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
'unknown',
|
|
||||||
null,
|
|
||||||
null
|
|
||||||
FROM ${stats_db_name}.dual
|
|
||||||
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -- /*EOS*/
|
|
||||||
DROP TABLE ${stats_db_name}.dual; -- /*EOS*/
|
|
||||||
|
|
||||||
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -- /*EOS*/
|
|
||||||
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -- /*EOS*/
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; -- /*EOS*/
|
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
||||||
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
||||||
|
|
Loading…
Reference in New Issue