2020-07-03 14:27:02 +02:00
|
|
|
-- noinspection SqlNoDataSourceInspectionForFile
|
|
|
|
|
2020-06-15 18:57:40 +02:00
|
|
|
------------------------------------------------------------
|
|
|
|
------------------------------------------------------------
|
|
|
|
-- Datasource table/view and Datasource related tables/views
|
|
|
|
------------------------------------------------------------
|
|
|
|
------------------------------------------------------------
|
2024-04-15 15:22:40 +02:00
|
|
|
-- Rebuild the datasource staging table from scratch on every run.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp PURGE; /*EOS*/

-- Bucketed ORC table flagged as transactional.
-- NOTE(review): presumably transactional so that the UPDATE statements issued against datasource_tmp further down are permitted -- confirm.
CREATE TABLE ${stats_db_name}.datasource_tmp
(
    `id`               STRING,
    `name`             STRING,
    `type`             STRING,
    `dateofvalidation` STRING,
    `yearofvalidation` STRING,
    `harvested`        BOOLEAN,
    `piwik_id`         INT,
    `latitude`         STRING,
    `longitude`        STRING,
    `websiteurl`       STRING,
    `compatibility`    STRING,
    `issn_printed`     STRING,
    `issn_online`      STRING
)
CLUSTERED BY (id) INTO 100 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true'); /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2020-06-17 22:48:01 +02:00
|
|
|
-- Fill datasource_tmp from the OpenAIRE graph datasource table.
-- The id is written without its first three characters, and harvested is always FALSE at this stage.
-- piwik_id comes from the "piwik:<id>" entries of originalid (the part after the colon) and falls back to 0
-- when a datasource has no such entry. A datasource with several piwik entries yields one row per entry.
-- Datasources flagged deletedbyinference or invisible are skipped.
INSERT INTO ${stats_db_name}.datasource_tmp
SELECT
    substr(ds.id, 4)                               AS id,
    ds.officialname.value                          AS name,
    ds.datasourcetype.classname                    AS type,
    ds.dateofvalidation.value                      AS dateofvalidation,
    date_format(ds.dateofvalidation.value, 'yyyy') AS yearofvalidation,
    FALSE                                          AS harvested,
    CASE
        WHEN pw.piwik_id IS NULL THEN 0
        ELSE pw.piwik_id
    END                                            AS piwik_id,
    ds.latitude.value                              AS latitude,
    ds.longitude.value                             AS longitude,
    ds.websiteurl.value                            AS websiteurl,
    ds.openairecompatibility.classid               AS compatibility,
    ds.journal.issnprinted                         AS issn_printed,
    ds.journal.issnonline                          AS issn_online
FROM ${openaire_db_name}.datasource ds
LEFT JOIN (
    SELECT
        id,
        split(original_id, '\\:')[1] AS piwik_id
    FROM ${openaire_db_name}.datasource
    LATERAL VIEW EXPLODE(originalid) exploded_ids AS original_id
    WHERE original_id LIKE "piwik:%"
) AS pw
    ON ds.id = pw.id
WHERE ds.datainfo.deletedbyinference = FALSE
  AND ds.datainfo.invisible = FALSE; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
|
|
|
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary one-row dual table that is removed again after the following insert.
-- The guard drop clears a dual table left behind by an earlier interrupted run, so the CREATE below cannot fail on it.
DROP TABLE IF EXISTS ${stats_db_name}.dual PURGE; /*EOS*/

CREATE TABLE ${stats_db_name}.dual (dummy CHAR(1)); /*EOS*/

INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/

-- Add the catch-all datasource row (id 'other', name 'Other', type 'Repository', compatibility 'unknown').
-- Selecting from the one-row dual table produces exactly one candidate row.
-- The row is only inserted when 'other' is not among the ids of the rows named 'Unknown Repository'.
-- NOTE(review): with NOT IN, a NULL id among those rows would also suppress the insert -- confirm this is acceptable.
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
                                             `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
SELECT 'other',
       'Other',
       'Repository',
       NULL,
       NULL,
       false,
       0,
       NULL,
       NULL,
       NULL,
       'unknown',
       null,
       null
FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/

DROP TABLE ${stats_db_name}.dual; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2024-04-15 15:22:40 +02:00
|
|
|
-- Rows named 'Unknown Repository' are renamed to 'Other'.
UPDATE ${stats_db_name}.datasource_tmp
SET name = 'Other'
WHERE name = 'Unknown Repository'; /*EOS*/

-- A yearofvalidation holding the literal '-1' is replaced with NULL.
UPDATE ${stats_db_name}.datasource_tmp
SET yearofvalidation = NULL
WHERE yearofvalidation = '-1'; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2024-04-15 15:22:40 +02:00
|
|
|
-- datasource languages:
-- one row per (datasource, language) pair, obtained by exploding odlanguages.value.
-- Datasources flagged deletedbyinference or invisible are skipped.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages PURGE; /*EOS*/

CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT
    substr(ds.id, 4)     AS id,
    lang_view.lang_value AS language
FROM ${openaire_db_name}.datasource ds
LATERAL VIEW EXPLODE(ds.odlanguages.value) lang_view AS lang_value
WHERE ds.datainfo.deletedbyinference = FALSE
  AND ds.datainfo.invisible = FALSE; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2024-04-15 15:22:40 +02:00
|
|
|
-- datasource original ids:
-- one row per (datasource, original id) pair, obtained by exploding originalid.
-- Datasources flagged deletedbyinference or invisible are skipped.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids PURGE; /*EOS*/

CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT
    substr(ds.id, 4)   AS id,
    oid_view.oid_value AS oid
FROM ${openaire_db_name}.datasource ds
LATERAL VIEW EXPLODE(ds.originalid) oid_view AS oid_value
WHERE ds.datainfo.deletedbyinference = FALSE
  AND ds.datainfo.invisible = FALSE; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2024-04-15 15:22:40 +02:00
|
|
|
-- datasource organizations:
-- built from 'datasourceOrganization' relations whose source starts with '20|'.
-- The relation target becomes the datasource id and the source becomes the organization,
-- each written without its first three characters.
-- Relations flagged deletedbyinference or invisible are skipped.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations PURGE; /*EOS*/

CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT
    substr(rel.target, 4) AS id,
    substr(rel.source, 4) AS organization
FROM ${openaire_db_name}.relation rel
WHERE rel.reltype = 'datasourceOrganization'
  AND rel.datainfo.deletedbyinference = FALSE
  AND rel.source LIKE '20|%'
  AND rel.datainfo.invisible = FALSE; /*EOS*/
|
2020-06-15 18:57:40 +02:00
|
|
|
|
2020-07-03 14:27:02 +02:00
|
|
|
-- datasource sources:
-- where the datasource info has been collected from.
-- One row per (datasource, collectedfrom entry) pair, and the key of each entry is written without its first three characters.
-- Datasources flagged deletedbyinference or invisible are skipped.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources PURGE; /*EOS*/

CREATE TABLE IF NOT EXISTS ${stats_db_name}.datasource_sources STORED AS PARQUET AS
SELECT
    substr(ds.id, 4)          AS id,
    substr(origin.key, 4)     AS datasource
FROM ${openaire_db_name}.datasource ds
LATERAL VIEW EXPLODE(ds.collectedfrom) collected AS origin
WHERE ds.datainfo.deletedbyinference = FALSE
  AND ds.datainfo.invisible = FALSE; /*EOS*/
|
2020-07-03 14:27:02 +02:00
|
|
|
|
2023-01-04 10:39:01 +01:00
|
|
|
-- datasource results:
-- view over result_datasources with the two columns swapped into a datasource-first layout
-- (datasource is exposed as id, id is exposed as result).
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT
    rd.datasource AS id,
    rd.id         AS result
FROM ${stats_db_name}.result_datasources rd; /*EOS*/
|