Update "dhp-stats-update" workflow to use "spark"-actions, instead of "hive" ones.

Note: Currently the code is set to only test the "Step1".
This commit is contained in:
Lampros Smyrnaios 2024-04-15 16:22:40 +03:00
parent d7da4f814b
commit db33f7727c
22 changed files with 627 additions and 382 deletions

View File

@ -0,0 +1,18 @@
# Install the whole "dnet-hadoop" project.
# NOTE(review): the relative paths below assume this script is started from inside this module's directory — confirm.
# Delete this module's previous build-files in order to avoid any conflicts.
# (No trailing "||" here: it would chain into the "cd" below and make it run only when "rm" fails.)
rm -rf target/
# Go to the root directory of this project.
# Abort if the directory change fails, so that Maven is never launched from an unexpected location.
cd ../../ || exit 1
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Install the project.
# ${CHOSEN_MAVEN_PROFILE} is left unquoted on purpose: the empty default profile must expand to no argument at all.
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
# We skip tests for all modules, since they take a big amount of time and some of them fail.
# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script.

View File

@ -0,0 +1,20 @@
# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file.
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Build and deploy this module.
# ${CHOSEN_MAVEN_PROFILE} is left unquoted on purpose: the empty default profile must expand to no argument at all.
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
    -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
# Show the Oozie-job-ID.
# "printf" is used instead of "echo -e", since "-e" is not portable: some shells print it literally.
# The output is the same: two empty lines, the header, and one empty line.
printf '\n\nShowing the contents of "extract-and-run-on-remote-host.log":\n\n'
cat ./target/extract-and-run-on-remote-host.log
# Check oozie workflow status
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
# Get the <job-ID> from the previous output and check the logs:
# yarn logs -applicationId <job-ID>

View File

@ -4,5 +4,5 @@
-------------------------------------------------------------- --------------------------------------------------------------
-------------------------------------------------------------- --------------------------------------------------------------
DROP database IF EXISTS ${stats_db_name} CASCADE; DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
CREATE database ${stats_db_name}; CREATE database ${stats_db_name}; /*EOS*/

View File

@ -5,27 +5,27 @@
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
SELECT * SELECT *
FROM ${external_stats_db_name}.fundref; FROM ${external_stats_db_name}.fundref; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.country AS CREATE OR REPLACE VIEW ${stats_db_name}.country AS
SELECT * SELECT *
FROM ${external_stats_db_name}.country; FROM ${external_stats_db_name}.country; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
SELECT * SELECT *
FROM ${external_stats_db_name}.countrygdp; FROM ${external_stats_db_name}.countrygdp; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
SELECT * SELECT *
FROM ${external_stats_db_name}.roarmap; FROM ${external_stats_db_name}.roarmap; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT * SELECT *
FROM ${external_stats_db_name}.rndexpediture; FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
SELECT * SELECT *
FROM ${external_stats_db_name}.licenses_normalized; FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
@ -33,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
create or replace view ${stats_db_name}.usage_stats as create or replace view ${stats_db_name}.usage_stats as
select * from openaire_prod_usage_stats.usage_stats; select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
create or replace view ${stats_db_name}.downloads_stats as create or replace view ${stats_db_name}.downloads_stats as
select * from openaire_prod_usage_stats.downloads_stats; select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
create or replace view ${stats_db_name}.pageviews_stats as create or replace view ${stats_db_name}.pageviews_stats as
select * from openaire_prod_usage_stats.pageviews_stats; select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
create or replace view ${stats_db_name}.views_stats as create or replace view ${stats_db_name}.views_stats as
select * from openaire_prod_usage_stats.views_stats; select * from openaire_prod_usage_stats.views_stats; /*EOS*/
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
-- Creation date of the database -- Creation date of the database
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
create table ${stats_db_name}.creation_date STORED AS PARQUET as create table ${stats_db_name}.creation_date STORED AS PARQUET as
select date_format(current_date(), 'dd-MM-yyyy') as date; select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/

View File

@ -10,7 +10,7 @@ SET harvested='true'
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
FROM ${stats_db_name}.datasource_tmp d, FROM ${stats_db_name}.datasource_tmp d,
${stats_db_name}.result_datasources rd ${stats_db_name}.result_datasources rd
WHERE d.id = rd.datasource); WHERE d.id = rd.datasource); /*EOS*/
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp UPDATE ${stats_db_name}.project_tmp
@ -19,8 +19,8 @@ WHERE project_tmp.id IN (SELECT pr.id
FROM ${stats_db_name}.project_results pr, FROM ${stats_db_name}.project_results pr,
${stats_db_name}.result r ${stats_db_name}.result r
WHERE pr.result = r.id WHERE pr.result = r.id
AND r.type = 'publication'); AND r.type = 'publication'); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.stored purge; DROP TABLE IF EXISTS ${stats_db_name}.stored purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project stored as parquet as CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id, SELECT p.id,
@ -63,7 +63,7 @@ FROM ${stats_db_name}.project_tmp p
AND r.type = 'publication' AND r.type = 'publication'
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2 GROUP BY pp.id) AS prr2
ON prr2.id = p.id; ON prr2.id = p.id; /*EOS*/
UPDATE ${stats_db_name}.publication_tmp UPDATE ${stats_db_name}.publication_tmp
SET delayed = 'yes' SET delayed = 'yes'
@ -73,7 +73,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p ${stats_db_name}.project_tmp p
WHERE r.id = pr.result WHERE r.id = pr.result
AND pr.id = p.id AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0); AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.dataset_tmp UPDATE ${stats_db_name}.dataset_tmp
SET delayed = 'yes' SET delayed = 'yes'
@ -83,7 +83,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p ${stats_db_name}.project_tmp p
WHERE r.id = pr.result WHERE r.id = pr.result
AND pr.id = p.id AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0); AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.software_tmp UPDATE ${stats_db_name}.software_tmp
SET delayed = 'yes' SET delayed = 'yes'
@ -93,7 +93,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p ${stats_db_name}.project_tmp p
WHERE r.id = pr.result WHERE r.id = pr.result
AND pr.id = p.id AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0); AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.otherresearchproduct_tmp UPDATE ${stats_db_name}.otherresearchproduct_tmp
SET delayed = 'yes' SET delayed = 'yes'
@ -103,7 +103,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p ${stats_db_name}.project_tmp p
WHERE r.id = pr.result WHERE r.id = pr.result
AND pr.id = p.id AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0); AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
SELECT result_projects.id AS result, SELECT result_projects.id AS result,
@ -116,4 +116,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project ${stats_db_name}.project
WHERE result_projects.id = result.id WHERE result_projects.id = result.id
AND result.type = 'publication' AND result.type = 'publication'
AND project.id = result_projects.project; AND project.id = result_projects.project; /*EOS*/

View File

@ -1,42 +1,42 @@
------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
SELECT * SELECT *
FROM ${stats_db_name}.datasource_tmp; FROM ${stats_db_name}.datasource_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
SELECT * SELECT *
FROM ${stats_db_name}.publication_tmp; FROM ${stats_db_name}.publication_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
SELECT * SELECT *
FROM ${stats_db_name}.dataset_tmp; FROM ${stats_db_name}.dataset_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software purge; DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software stored AS parquet AS CREATE TABLE ${stats_db_name}.software stored AS parquet AS
SELECT * SELECT *
FROM ${stats_db_name}.software_tmp; FROM ${stats_db_name}.software_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_tmp; FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.project_tmp; DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.datasource_tmp; DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.publication_tmp; DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.dataset_tmp; DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.software_tmp; DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
---------------------------------------------- ----------------------------------------------
-- Re-creating views from final parquet tables -- Re-creating views from final parquet tables
@ -54,4 +54,4 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset FROM ${stats_db_name}.dataset
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct; FROM ${stats_db_name}.otherresearchproduct; /*EOS*/

View File

@ -5,7 +5,7 @@
-- Sources related tables/views -- Sources related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -16,9 +16,9 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -29,9 +29,9 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -42,9 +42,9 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -55,7 +55,7 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources SELECT * FROM ${stats_db_name}.publication_sources
@ -64,9 +64,9 @@ SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.software_sources SELECT * FROM ${stats_db_name}.software_sources
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
@ -76,9 +76,9 @@ from (
LATERAL VIEW explode(author) a as auth LATERAL VIEW explode(author) a as auth
LATERAL VIEW explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
@ -91,9 +91,9 @@ where reltype='resultResult'
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
@ -108,9 +108,9 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(target, 4); group by substr(target, 4); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
select substr(source, 4) as id, count(distinct substr(target, 4)) as references select substr(source, 4) as id, count(distinct substr(target, 4)) as references
@ -125,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(source, 4); group by substr(source, 4); /*EOS*/

View File

@ -5,33 +5,33 @@
-- Licences related tables/views -- Licences related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses SELECT * FROM ${stats_db_name}.publication_licenses
@ -40,15 +40,15 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
@ -58,10 +58,10 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; /*EOS*/

View File

@ -6,7 +6,7 @@
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -22,9 +22,9 @@ from (
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -40,9 +40,9 @@ from (
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -58,9 +58,9 @@ from (
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -76,7 +76,7 @@ from (
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed select * from ${stats_db_name}.publication_refereed
@ -85,17 +85,17 @@ select * from ${stats_db_name}.dataset_refereed
union all union all
select * from ${stats_db_name}.software_refereed select * from ${stats_db_name}.software_refereed
union all union all
select * from ${stats_db_name}.otherresearchproduct_refereed; select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
-- Rebuild indi_impact_measures from scratch (drop, then create as select).
-- Each element of result.measures is exploded into its own row; measures whose id is
-- 'views' or 'downloads' are filtered out.
-- The first entry of unit.value is exposed twice (as a double `score` and as a decimal(6,3)
-- `score_dec`); the second entry is exposed as `impact_class`.
-- substr(id, 4) keeps the id from its 4th character onwards.
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads'; where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
@ -104,4 +104,4 @@ rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/

View File

@ -1,25 +1,25 @@
------------------------------------------- -------------------------------------------
--- Extra tables, mostly used by indicators --- Extra tables, mostly used by indicators
-- Rebuild result_projectcount: for every result, the number of distinct projects linked to it
-- through result_projects. Both joins are left outer joins, so a result with no matching
-- project is still emitted (count(distinct ...) over only-NULL values yields 0).
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
select r.id, count(distinct p.id) as count select r.id, count(distinct p.id) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id; /*EOS*/
-- Rebuild result_fundercount: for every result, the number of distinct funders (project.funder)
-- among the projects linked to it through result_projects. Same join shape as
-- result_projectcount: left outer joins keep results that have no project at all.
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
select r.id, count(distinct p.funder) as count select r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as ( with rcount as (
@ -33,17 +33,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='software' then rcount.count else 0 end) as software,
sum(case when rcount.type='other' then rcount.count else 0 end) as other sum(case when rcount.type='other' then rcount.count else 0 end) as other
from rcount from rcount
group by rcount.pid; group by rcount.pid; /*EOS*/
-- Expose tables of the stats_ext schema inside the stats database as pass-through views
-- (select * with no filtering).
-- NOTE(review): the first view is named rndexpenditure but reads from stats_ext.rndexpediture
-- (no 'n' before 'diture') -- confirm this matches the actual table name in stats_ext.
-- NOTE(review): the schema name stats_ext is hard-coded here, while the issn_gold_oa_dataset
-- view further down in this file reads from ${external_stats_db_name} -- confirm whether
-- these are meant to be the same database.
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
create table if not exists ${stats_db_name}.result_instance stored as parquet as create table if not exists ${stats_db_name}.result_instance stored as parquet as
select distinct r.* select distinct r.*
@ -51,9 +51,9 @@ from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id; join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select distinct r.id, r.amount, r.currency select distinct r.id, r.amount, r.currency
@ -61,6 +61,6 @@ from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null; where r.amount is not null; /*EOS*/
-- Pass-through view over issn_gold_oa_dataset in the database named by ${external_stats_db_name}.
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/

View File

@ -3,26 +3,26 @@
---------------------------------------------------- ----------------------------------------------------
-- Peer reviewed: -- Peer reviewed:
-- Rebuild result_peerreviewed: one boolean per result.
-- peer_reviewed is true only when the result has doi_from_crossref=1 AND grey_lit=0 in the
-- two indicator tables. Because both are left outer joins, a result missing from either
-- indicator table compares against NULL and falls into the `else false` branch.
drop table if exists ${stats_db_name}.result_peerreviewed purge; drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
-- Green OA: -- Green OA:
-- Rebuild result_greenoa: one boolean per result, true when indi_pub_green_oa.green_oa=1.
-- Results with no row in indi_pub_green_oa (left outer join miss) get false.
drop table if exists ${stats_db_name}.result_greenoa purge; drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
select r.id, case when green.green_oa=1 then true else false end as green select r.id, case when green.green_oa=1 then true else false end as green
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
-- GOLD OA: -- GOLD OA:
-- Rebuild result_gold: one boolean per result, true when indi_pub_gold_oa.is_gold=1.
-- Results with no row in indi_pub_gold_oa (left outer join miss) get false.
drop table if exists ${stats_db_name}.result_gold purge; drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
select r.id, case when gold.is_gold=1 then true else false end as gold select r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/

View File

@ -1,6 +1,6 @@
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed) -- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp; drop table if exists ${stats_db_name}.result_tmp; /*EOS*/
CREATE TABLE ${stats_db_name}.result_tmp ( CREATE TABLE ${stats_db_name}.result_tmp (
id STRING, id STRING,
@ -20,37 +20,37 @@ CREATE TABLE ${stats_db_name}.result_tmp (
peer_reviewed BOOLEAN, peer_reviewed BOOLEAN,
green BOOLEAN, green BOOLEAN,
gold BOOLEAN) gold BOOLEAN)
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/
-- Fill result_tmp with four structurally identical inserts, one per source table
-- (publication, dataset, software, otherresearchproduct). Each row is enriched with the
-- peer_reviewed / green / gold booleans through left outer joins, so a row with no match in
-- one of those tables gets NULL for that flag.
-- date_format(r.`date`, 'yyyy') supplies the year column derived from the date.
-- NOTE(review): r.bestlicence is selected twice in a row; presumably it feeds two adjacent
-- columns of result_tmp (the full column list is not visible in this excerpt) -- confirm.
-- Rows from publication:
insert into ${stats_db_name}.result_tmp insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.publication r FROM ${stats_db_name}.publication r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
-- Rows from dataset:
insert into ${stats_db_name}.result_tmp insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.dataset r FROM ${stats_db_name}.dataset r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
-- Rows from software:
insert into ${stats_db_name}.result_tmp insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.software r FROM ${stats_db_name}.software r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
-- Rows from otherresearchproduct:
insert into ${stats_db_name}.result_tmp insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.otherresearchproduct r FROM ${stats_db_name}.otherresearchproduct r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
-- Replace `result` with the enriched data: drop it whether it currently exists as a table or
-- as a view, recreate it as a parquet table copied from result_tmp, then remove result_tmp.
-- The final drop has no IF EXISTS, so it will fail if result_tmp is missing at that point.
drop table if exists ${stats_db_name}.result; drop table if exists ${stats_db_name}.result; /*EOS*/
drop view if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; /*EOS*/
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/
drop table ${stats_db_name}.result_tmp; drop table ${stats_db_name}.result_tmp; /*EOS*/

View File

@ -5,7 +5,7 @@
-------------------------------------------------------------- --------------------------------------------------------------
-- Publication temporary table -- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_tmp CREATE TABLE ${stats_db_name}.publication_tmp
( (
id STRING, id STRING,
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.publication_tmp
abstract BOOLEAN, abstract BOOLEAN,
type STRING type STRING
) )
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.publication_tmp INSERT INTO ${stats_db_name}.publication_tmp
SELECT substr(p.id, 4) as id, SELECT substr(p.id, 4) as id,
@ -39,17 +39,17 @@ SELECT substr(p.id, 4) as id,
case when size(p.description) > 0 then true else false end as abstract, case when size(p.description) > 0 then true else false end as abstract,
'publication' as type 'publication' as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- Rebuild publication_classifications: one row per (publication, instance type).
-- p.instance.instancetype is exploded and its classname is stored as `type`.
-- Only publications with deletedbyinference=false and invisible=false are included.
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, instancetype.classname as type SELECT substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
@ -58,9 +58,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -71,44 +71,44 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
-- Rebuild publication_languages: one row per publication with its language classname.
-- Only publications with deletedbyinference=false and invisible=false are included.
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
select substr(p.id, 4) as id, p.language.classname as language select substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- Rebuild publication_oids: one row per (publication, original id), obtained by exploding
-- p.originalid. Publications with an empty originalid array produce no row (plain
-- LATERAL VIEW, not OUTER).
-- Only publications with deletedbyinference=false and invisible=false are included.
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- Rebuild publication_pids: one row per (publication, pid), obtained by exploding p.pid.
-- `type` is the pid qualifier's classname and `pid` is the pid value.
-- Only publications with deletedbyinference=false and invisible=false are included.
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- Rebuild publication_topics: one row per (publication, subject), obtained by exploding
-- p.subject. TYPE is the subject qualifier's classname and topic is the subject value.
-- Only publications with deletedbyinference=false and invisible=false are included.
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- Rebuild publication_citations: one row per (publication, cited id).
-- Each element of p.extrainfo is exploded and its `value` is queried with xpath_string for
-- the @value attribute of <id type='openaire'> elements under <citation>.
-- Rows where that XPath yields an empty string are dropped, as are publications flagged
-- deletedbyinference or invisible.
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -5,7 +5,7 @@ from ${stats_db_name}.result r
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
from ${stats_db_name}.result_licenses rl from ${stats_db_name}.result_licenses rl
group by rl.id group by rl.id
) rln on rln.id=r.id; ) rln on rln.id=r.id; /*EOS*/
create table ${observatory_db_name}.result_affiliated_country stored as parquet as create table ${observatory_db_name}.result_affiliated_country stored as parquet as
@ -35,7 +35,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year stored as parquet as create table ${observatory_db_name}.result_affiliated_year stored as parquet as
@ -65,7 +65,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
@ -95,7 +95,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
@ -127,7 +127,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select select
@ -158,7 +158,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select select
@ -187,7 +187,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select select
@ -216,7 +216,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select select
@ -247,7 +247,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select select
@ -278,7 +278,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_country stored as parquet as create table ${observatory_db_name}.result_deposited_country stored as parquet as
select select
@ -309,7 +309,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_year stored as parquet as create table ${observatory_db_name}.result_deposited_year stored as parquet as
select select
@ -340,7 +340,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
@ -372,7 +372,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select select
@ -403,7 +403,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select select
@ -434,7 +434,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization stored as parquet as create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select select
@ -465,7 +465,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select select
@ -496,7 +496,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder stored as parquet as create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select select
@ -529,7 +529,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select select
@ -562,4 +562,4 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/

View File

@ -5,7 +5,7 @@
------------------------------------------------------ ------------------------------------------------------
-- Dataset temporary table supporting updates -- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_tmp CREATE TABLE ${stats_db_name}.dataset_tmp
( (
@ -23,7 +23,7 @@ CREATE TABLE ${stats_db_name}.dataset_tmp
abstract BOOLEAN, abstract BOOLEAN,
type STRING type STRING
) )
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.dataset_tmp INSERT INTO ${stats_db_name}.dataset_tmp
SELECT substr(d.id, 4) AS id, SELECT substr(d.id, 4) AS id,
@ -40,26 +40,26 @@ SELECT substr(d.id, 4) AS id,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type 'dataset' AS type
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/
-- dataset_citations: rebuilds the Parquet table of (id, cites) pairs for datasets.
-- Source rows come from ${openaire_db_name}.dataset, restricted to records whose
-- datainfo.deletedbyinference and datainfo.invisible are both false.
-- Each element of d.extrainfo is exploded into its own row, and "cites" is the
-- value attribute of the citation id element with type 'openaire', read from the
-- XML held in citation.value. Rows where that extraction is the empty string are dropped.
-- id is d.id from its 4th character onward (presumably stripping a 3-character
-- entity-type prefix -- TODO confirm against the graph id format).
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
-- dataset_classifications: rebuilds the Parquet table of (id, type) pairs for datasets.
-- p.instance.instancetype is exploded, giving one output row per instance type of a
-- dataset, and "type" is that instance type's classname.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a dataset whose exploded array is empty or NULL
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
@ -68,9 +68,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@ -82,35 +82,35 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
-- dataset_languages: rebuilds the Parquet table of (id, language) for datasets.
-- No array is exploded here, so there is exactly one output row per selected dataset,
-- carrying p.language.classname as "language".
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- dataset_oids: rebuilds the Parquet table of (id, oid) pairs for datasets.
-- p.originalid is exploded, giving one output row per original identifier of a dataset.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a dataset with an empty or NULL originalid array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- dataset_pids: rebuilds the Parquet table of (id, type, pid) triples for datasets.
-- p.pid is exploded, giving one output row per persistent identifier of a dataset:
-- "type" is the identifier's qualifier.classname and "pid" is its value.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a dataset with an empty or NULL pid array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- dataset_topics: rebuilds the Parquet table of (id, type, topic) triples for datasets.
-- p.subject is exploded, giving one output row per subject of a dataset:
-- "type" is the subject's qualifier.classname and "topic" is its value.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a dataset with an empty or NULL subject array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -5,7 +5,7 @@
-------------------------------------------------------- --------------------------------------------------------
-- Software temporary table supporting updates -- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_tmp CREATE TABLE ${stats_db_name}.software_tmp
( (
id STRING, id STRING,
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.software_tmp
abstract BOOLEAN, abstract BOOLEAN,
type STRING type STRING
) )
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.software_tmp INSERT INTO ${stats_db_name}.software_tmp
SELECT substr(s.id, 4) as id, SELECT substr(s.id, 4) as id,
@ -39,24 +39,24 @@ SELECT substr(s.id, 4) as id,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type 'software' as type
from ${openaire_db_name}.software s from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
-- software_citations: rebuilds the Parquet table of (id, cites) pairs for software.
-- Source rows come from ${openaire_db_name}.software, restricted to records whose
-- datainfo.deletedbyinference and datainfo.invisible are both false.
-- Each element of s.extrainfo is exploded into its own row, and "cites" is the
-- value attribute of the citation id element with type 'openaire', read from the
-- XML held in citation.value. Rows where that extraction is the empty string are dropped.
-- id is s.id from its 4th character onward (presumably stripping a 3-character
-- entity-type prefix -- TODO confirm against the graph id format).
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
-- software_classifications: rebuilds the Parquet table of (id, type) pairs for software.
-- p.instance.instancetype is exploded, giving one output row per instance type of a
-- software record, and "type" is that instance type's classname.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a record whose exploded array is empty or NULL
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
@ -65,9 +65,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
@ -79,35 +79,35 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
-- software_languages: rebuilds the Parquet table of (id, language) for software.
-- No array is exploded here, so there is exactly one output row per selected software
-- record, carrying p.language.classname as "language".
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
select substr(p.id, 4) AS id, p.language.classname AS language select substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- software_oids: rebuilds the Parquet table of (id, oid) pairs for software.
-- p.originalid is exploded, giving one output row per original identifier of a record.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a record with an empty or NULL originalid array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- software_pids: rebuilds the Parquet table of (id, type, pid) triples for software.
-- p.pid is exploded, giving one output row per persistent identifier of a record:
-- "type" is the identifier's qualifier.classname and "pid" is its value.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a record with an empty or NULL pid array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
-- software_topics: rebuilds the Parquet table of (id, type, topic) triples for software.
-- p.subject is exploded, giving one output row per subject of a record:
-- "type" is the subject's qualifier.classname and "topic" is its value.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The lateral view is not OUTER, so a record with an empty or NULL subject array
-- contributes no row.
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -5,7 +5,7 @@
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates -- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
( (
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
source STRING, source STRING,
abstract BOOLEAN, abstract BOOLEAN,
type STRING type STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
SELECT substr(o.id, 4) AS id, SELECT substr(o.id, 4) AS id,
@ -39,23 +39,23 @@ SELECT substr(o.id, 4) AS id,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type 'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/
-- Otherresearchproduct_citations -- Otherresearchproduct_citations
-- Rebuilds the Parquet table of (id, cites) pairs for other research products.
-- Source rows come from ${openaire_db_name}.otherresearchproduct, restricted to records
-- whose datainfo.deletedbyinference and datainfo.invisible are both false.
-- Each element of o.extrainfo is exploded into its own row, and "cites" is the
-- value attribute of the citation id element with type 'openaire', read from the
-- XML held in citation.value. Rows where that extraction is the empty string are dropped.
-- id is o.id from its 4th character onward (presumably stripping a 3-character
-- entity-type prefix -- TODO confirm against the graph id format).
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/
-- otherresearchproduct_classifications: rebuilds the Parquet table of (id, type) pairs
-- for other research products.
-- p.instance.instancetype is exploded, giving one output row per instance type of a
-- record, and "type" is that instance type's classname.
-- Only records with datainfo.deletedbyinference = false and datainfo.invisible = false
-- are read. id is p.id from its 4th character onward.
-- The table is dropped first, like the other tables built by this script, so that a
-- re-run against a database that still holds it does not fail on CREATE TABLE.
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT substr(p.id, 4) as id, case
@ -63,9 +63,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@ -74,32 +74,32 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN(SELECT substr(d.id, 4) id LEFT OUTER JOIN(SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -3,38 +3,38 @@
-- Project table/view and Project related tables/views -- Project table/view and Project related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' and r.source like '40|%' WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' and r.target like '40|%' WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/
create table ${stats_db_name}.project_classification STORED AS PARQUET as create table ${stats_db_name}.project_classification STORED AS PARQUET as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_tmp CREATE TABLE ${stats_db_name}.project_tmp
( (
@ -61,7 +61,7 @@ CREATE TABLE ${stats_db_name}.project_tmp
totalcost FLOAT, totalcost FLOAT,
fundedamount FLOAT, fundedamount FLOAT,
currency STRING currency STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.project_tmp INSERT INTO ${stats_db_name}.project_tmp
SELECT substr(p.id, 4) AS id, SELECT substr(p.id, 4) AS id,
@ -88,18 +88,18 @@ SELECT substr(p.id, 4) AS id,
p.fundedamount AS fundedamount, p.fundedamount AS fundedamount,
p.currency.value AS currency p.currency.value AS currency
FROM ${openaire_db_name}.project p FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
create table ${stats_db_name}.funder STORED AS PARQUET as create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id, select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fund, '//funder/shortname') as shortname,
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
@ -107,4 +107,4 @@ properties[0].value contribution, properties[1].value currency
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
LATERAL VIEW explode (r.properties) properties LATERAL VIEW explode (r.properties) properties
where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%'
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/

View File

@ -16,7 +16,7 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset_tmp FROM ${stats_db_name}.dataset_tmp
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct_tmp; FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
-- Views on final tables -- Views on final tables
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
@ -30,7 +30,7 @@ SELECT *
FROM ${stats_db_name}.dataset_datasources FROM ${stats_db_name}.dataset_datasources
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_datasources; FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS
SELECT * SELECT *
@ -43,7 +43,7 @@ SELECT *
FROM ${stats_db_name}.dataset_citations FROM ${stats_db_name}.dataset_citations
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_citations; FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS
SELECT * SELECT *
@ -56,7 +56,7 @@ SELECT *
FROM ${stats_db_name}.dataset_classifications FROM ${stats_db_name}.dataset_classifications
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_classifications; FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS
SELECT * SELECT *
@ -69,7 +69,7 @@ SELECT *
FROM ${stats_db_name}.dataset_concepts FROM ${stats_db_name}.dataset_concepts
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_concepts; FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS
SELECT * SELECT *
@ -82,7 +82,7 @@ SELECT *
FROM ${stats_db_name}.dataset_languages FROM ${stats_db_name}.dataset_languages
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_languages; FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS
SELECT * SELECT *
@ -95,7 +95,7 @@ SELECT *
FROM ${stats_db_name}.dataset_oids FROM ${stats_db_name}.dataset_oids
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_oids; FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS
SELECT * SELECT *
@ -108,7 +108,7 @@ SELECT *
FROM ${stats_db_name}.dataset_pids FROM ${stats_db_name}.dataset_pids
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_pids; FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS
SELECT * SELECT *
@ -121,9 +121,9 @@ SELECT *
FROM ${stats_db_name}.dataset_topics FROM ${stats_db_name}.dataset_topics
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics; FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/
create table ${stats_db_name}.result_fos stored as parquet as create table ${stats_db_name}.result_fos stored as parquet as
with with
@ -133,22 +133,22 @@ with
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
from lvl1 from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization' WHERE r.reltype = 'resultOrganization'
and r.target like '50|%' and r.target like '50|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/

View File

@ -5,7 +5,7 @@
-- Datasource table/view and Datasource related tables/views -- Datasource table/view and Datasource related tables/views
------------------------------------------------------------ ------------------------------------------------------------
------------------------------------------------------------ ------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_tmp CREATE TABLE ${stats_db_name}.datasource_tmp
( (
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp
`compatibility` STRING, `compatibility` STRING,
issn_printed STRING, issn_printed STRING,
issn_online STRING issn_online STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
-- Insert statement that takes into account the piwik_id of the openAIRE graph -- Insert statement that takes into account the piwik_id of the openAIRE graph
INSERT INTO ${stats_db_name}.datasource_tmp INSERT INTO ${stats_db_name}.datasource_tmp
@ -46,14 +46,14 @@ FROM ${openaire_db_name}.datasource d1
LATERAL VIEW EXPLODE(originalid) temp AS originalidd LATERAL VIEW EXPLODE(originalid) temp AS originalidd
WHERE originalidd like "piwik:%") AS d2 WHERE originalidd like "piwik:%") AS d2
ON d1.id = d2.id ON d1.id = d2.id
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary dual table that will be removed after the following insert -- Creating a temporary dual table that will be removed after the following insert
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/
INSERT INTO ${stats_db_name}.dual VALUES ('X'); INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
@ -71,42 +71,42 @@ SELECT 'other',
null, null,
null null
FROM ${stats_db_name}.dual FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/
DROP TABLE ${stats_db_name}.dual; DROP TABLE ${stats_db_name}.dual; /*EOS*/
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
-- datasource sources: -- datasource sources:
-- where the datasource info have been collected from. -- where the datasource info have been collected from.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; FROM ${stats_db_name}.result_datasources; /*EOS*/

View File

@ -3,7 +3,7 @@
-- Organization table/view and Organization related tables/views -- Organization table/view and Organization related tables/views
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.organization purge; DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
SELECT substr(o.id, 4) as id, SELECT substr(o.id, 4) as id,
@ -11,12 +11,12 @@ SELECT substr(o.id, 4) as id,
o.legalshortname.value as legalshortname, o.legalshortname.value as legalshortname,
o.country.classid as country o.country.classid as country
FROM ${openaire_db_name}.organization o FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT organization AS id, id AS datasource SELECT organization AS id, id AS datasource
FROM ${stats_db_name}.datasource_organizations; FROM ${stats_db_name}.datasource_organizations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS
SELECT id AS project, organization as id SELECT id AS project, organization as id
FROM ${stats_db_name}.project_organizations; FROM ${stats_db_name}.project_organizations; /*EOS*/

View File

@ -154,180 +154,354 @@
</kill> </kill>
<action name="Step1"> <action name="Step1">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step1.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step1</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<ok to="Step2"/> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<!-- <ok to="Step2"/>-->
<ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step2"> <action name="Step2">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step2.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step2</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step3"/> <ok to="Step3"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step3"> <action name="Step3">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step3.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step3</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step4"/> <ok to="Step4"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step4"> <action name="Step4">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step4.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step4</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step5"/> <ok to="Step5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step5"> <action name="Step5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step6"/> <ok to="Step6"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step6: executes step6.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step7, on error to Kill. -->
<action name="Step6"> <action name="Step6">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step6.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step6</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step7"/> <ok to="Step7"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step7: executes step7.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step8, on error to Kill. -->
<action name="Step7"> <action name="Step7">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step7.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step7</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step8"/> <ok to="Step8"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step8: executes step8.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step9, on error to Kill. -->
<action name="Step8"> <action name="Step8">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step8.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step8</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step9"/> <ok to="Step9"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step9: executes step9.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step10, on error to Kill. -->
<action name="Step9"> <action name="Step9">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step9.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step9</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step10"/> <ok to="Step10"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step10: executes step10.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode. On success go to Step11, on error to Kill.
The hive2 action supplied three script parameters (stats_db_name, openaire_db_name, external_stats_db_name);
the spark action must hand over all three, so the external_stats_db_name argument is passed as well.
NOTE(review): presumably RunSQLSparkJob resolves the script placeholders from these arguments; confirm. -->
<action name="Step10"> <action name="Step10">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step10.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step10</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step11"/> <ok to="Step11"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step11: executes step11.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode. On success go to Step12, on error to Kill.
The hive2 action supplied three script parameters (stats_db_name, openaire_db_name, external_stats_db_name);
the spark action must hand over all three, so the external_stats_db_name argument is passed as well.
NOTE(review): presumably RunSQLSparkJob resolves the script placeholders from these arguments; confirm. -->
<action name="Step11"> <action name="Step11">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step11.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step11</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step12"/> <ok to="Step12"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step12: executes step12.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step13, on error to Kill. -->
<action name="Step12"> <action name="Step12">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step12.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step12</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step13"/> <ok to="Step13"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step13: executes step13.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step14, on error to Kill. -->
<action name="Step13"> <action name="Step13">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step13.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step13</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step14"/> <ok to="Step14"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step14: executes step14.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step15, on error to Kill. -->
<action name="Step14"> <action name="Step14">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step14.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step14</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15"/> <ok to="Step15"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step15: executes step15.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names as arguments. On success go to Step15_5, on error to Kill. -->
<action name="Step15"> <action name="Step15">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step15.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step15</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15_5"/> <ok to="Step15_5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step15_5: executes step15_5.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode. On success go to Contexts, on error to Kill.
The hive2 action supplied three script parameters (stats_db_name, openaire_db_name, external_stats_db_name);
the spark action must hand over all three, so the external_stats_db_name argument is passed as well.
NOTE(review): presumably RunSQLSparkJob resolves the script placeholders from these arguments; confirm. -->
<action name="Step15_5"> <action name="Step15_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step15_5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step15_5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Contexts"/> <ok to="Contexts"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -379,23 +553,45 @@
</action> </action>
<!-- Step16_1-definitions: executes step16_1-definitions.sql. The former hive2 action is replaced by a spark
action that launches eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore
URIs, the SQL resource path and the stats/openaire database names. On success go to Step16_5, on error to Kill. -->
<action name="Step16_1-definitions"> <action name="Step16_1-definitions">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step16_1-definitions.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step16_1-definitions</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step16_5"/> <ok to="Step16_5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Step16_5: executes step16_5.sql. The former hive2 action is replaced by a spark action that launches
eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode, handing over the metastore URIs, the SQL
resource path and the stats/openaire database names. On success go to Step19-finalize, on error to Kill. -->
<action name="Step16_5"> <action name="Step16_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step16_5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step16_5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step19-finalize"/> <ok to="Step19-finalize"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -461,12 +657,23 @@
</action> </action>
<!-- step21-createObservatoryDB: executes step21-createObservatoryDB.sql. The former hive2 action is replaced
by a spark action that launches eu.dnetlib.dhp.oozie.RunSQLSparkJob on YARN in cluster mode.
On success go to step21-createObservatoryDB-post, on error to Kill.
The hive2 action supplied stats_db_name and observatory_db_name to the script, so the observatory_db_name
argument is passed to the spark job as well. The openaire_db_name argument is kept for compatibility,
although the hive2 action never supplied it.
NOTE(review): presumably RunSQLSparkJob resolves the script placeholders from these arguments; confirm. -->
<action name="step21-createObservatoryDB"> <action name="step21-createObservatoryDB">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step21-createObservatoryDB.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step21-createObservatoryDB</name>
<param>observatory_db_name=${observatory_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--observatory_db_name</arg><arg>${observatory_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="step21-createObservatoryDB-post"/> <ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>