2020-06-11 20:01:14 +02:00
|
|
|
--------------------------------------------------------------
|
|
|
|
--------------------------------------------------------------
|
2020-06-15 18:57:40 +02:00
|
|
|
-- Publication table/view and Publication related tables/views
|
2020-06-11 20:01:14 +02:00
|
|
|
--------------------------------------------------------------
|
|
|
|
--------------------------------------------------------------
|
|
|
|
|
|
|
|
-- Publication temporary table
|
2021-02-14 02:14:24 +01:00
|
|
|
-- Staging table holding one flattened row per publication.
-- Rows are bucketed on id into 100 buckets and stored as transactional ORC.
CREATE TABLE ${stats_db_name}.publication_tmp (
    id               STRING,
    title            STRING,
    publisher        STRING,
    journal          STRING,
    date             STRING,
    year             STRING,
    bestlicence      STRING,
    embargo_end_date STRING,
    delayed          BOOLEAN,
    authors          INT,
    source           STRING,
    abstract         BOOLEAN,
    type             STRING
)
CLUSTERED BY (id) INTO 100 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');
|
2020-06-13 12:35:53 +02:00
|
|
|
|
2021-02-14 02:14:24 +01:00
|
|
|
-- Fill publication_tmp from the source publication table.
-- Only records that are neither deleted by inference nor invisible are kept.
-- Column notes, in select order:
--   id        source id starting from its 4th character
--   title     value of the first title entry only
--   year      four digit year formatted from the acceptance date
--   delayed   always written as false by this statement
--   authors   number of entries in the author array
--   source    all source values joined with the character U+003B,
--             which is spelled as a unicode escape in the literal
--   abstract  true only when the description array has at least one entry
--   type      constant label publication
INSERT INTO ${stats_db_name}.publication_tmp
SELECT
    substr(pub.id, 4)                                                   AS id,
    pub.title[0].value                                                  AS title,
    pub.publisher.value                                                 AS publisher,
    pub.journal.name                                                    AS journal,
    pub.dateofacceptance.value                                          AS date,
    date_format(pub.dateofacceptance.value, 'yyyy')                     AS year,
    pub.bestaccessright.classname                                       AS bestlicence,
    pub.embargoenddate.value                                            AS embargo_end_date,
    FALSE                                                               AS delayed,
    size(pub.author)                                                    AS authors,
    concat_ws('\u003B', pub.source.value)                               AS source,
    CASE WHEN size(pub.description) > 0 THEN TRUE ELSE FALSE END        AS abstract,
    'publication'                                                       AS type
FROM ${openaire_db_name}.publication pub
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-11 20:01:14 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and instance type.
-- The instance type array of each publication is exploded and the class name
-- of every entry is kept. Deleted-by-inference and invisible records are skipped.
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT
    substr(pub.id, 4)              AS id,
    inst_types.itype.classname     AS type
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.instance.instancetype) inst_types AS itype
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-13 12:35:53 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and context, with the context id padded to three
-- levels separated by double colons:
--   three or more levels  kept unchanged
--   two levels            one trailing level named other is appended
--   one level             two trailing levels named other are appended
-- The CASE has no ELSE branch, so an id matching none of the patterns gives a
-- NULL concept. Note that the bracket expression with a doubled colon is a
-- character class and matches the same text as one with a single colon.
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT
    substr(pub.id, 4) AS id,
    CASE
        WHEN ctx_rows.ctx.id RLIKE '^[^::]+::[^::]+::.+$'
            THEN ctx_rows.ctx.id
        WHEN ctx_rows.ctx.id RLIKE '^[^::]+::[^::]+$'
            THEN concat(ctx_rows.ctx.id, '::other')
        WHEN ctx_rows.ctx.id RLIKE '^[^::]+$'
            THEN concat(ctx_rows.ctx.id, '::other::other')
    END AS concept
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.context) ctx_rows AS ctx
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-13 12:35:53 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and instance, carrying the hosting datasource id.
-- The left side explodes the instances of every kept publication and takes the
-- hostedby key from its 4th character. The right side lists the ids of kept
-- datasources, also from their 4th character. When the hosting datasource is
-- not found on the right side the datasource column is set to the text other.
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET AS
SELECT
    hosted.id AS id,
    CASE
        WHEN known_ds.id IS NULL THEN 'other'
        ELSE hosted.datasource
    END AS datasource
FROM (
    SELECT
        substr(pub.id, 4)                          AS id,
        substr(inst_rows.inst.hostedby.key, 4)     AS datasource
    FROM ${openaire_db_name}.publication pub
        LATERAL VIEW explode(pub.instance) inst_rows AS inst
    WHERE pub.datainfo.deletedbyinference = FALSE
      AND pub.datainfo.invisible = FALSE
) hosted
LEFT OUTER JOIN (
    SELECT substr(ds.id, 4) AS id
    FROM ${openaire_db_name}.datasource ds
    WHERE ds.datainfo.deletedbyinference = FALSE
      AND ds.datainfo.invisible = FALSE
) known_ds
    ON hosted.datasource = known_ds.id;
|
2020-06-13 12:35:53 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per kept publication with the class name of its language.
-- Deleted-by-inference and invisible records are skipped.
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
SELECT
    substr(pub.id, 4)          AS id,
    pub.language.classname     AS language
FROM ${openaire_db_name}.publication pub
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-11 20:01:14 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and original identifier.
-- The originalid array is exploded so each entry becomes its own row.
-- Deleted-by-inference and invisible records are skipped.
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT
    substr(pub.id, 4)      AS id,
    oid_rows.orig_id       AS oid
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.originalid) oid_rows AS orig_id
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-11 20:01:14 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and persistent identifier.
-- Each entry of the pid array yields its qualifier class name as type and its
-- value as pid. Deleted-by-inference and invisible records are skipped.
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT
    substr(pub.id, 4)                      AS id,
    pid_rows.pid_entry.qualifier.classname AS type,
    pid_rows.pid_entry.value               AS pid
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.pid) pid_rows AS pid_entry
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-11 20:01:14 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and subject.
-- Each entry of the subject array yields its qualifier class name as the type
-- column and its value as topic. Deleted-by-inference and invisible records
-- are skipped.
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET AS
SELECT
    substr(pub.id, 4)                        AS id,
    subj_rows.subj.qualifier.classname       AS TYPE,
    subj_rows.subj.value                     AS topic
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.subject) subj_rows AS subj
WHERE pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2022-03-22 15:16:08 +01:00
|
|
|
-- One row per publication and cited record.
-- Each extrainfo entry is read as XML and the value attribute of the citation
-- id element whose type attribute is openaire is extracted. Entries where that
-- extraction gives an empty string are dropped, as are deleted-by-inference
-- and invisible publications.
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT
    substr(pub.id, 4) AS id,
    xpath_string(extra_rows.extra.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication pub
    LATERAL VIEW explode(pub.extrainfo) extra_rows AS extra
WHERE xpath_string(extra_rows.extra.value, "//citation/id[@type='openaire']/@value") != ""
  AND pub.datainfo.deletedbyinference = FALSE
  AND pub.datainfo.invisible = FALSE;
|