1
0
Fork 0
This commit is contained in:
Antonis Lempesis 2024-09-23 15:25:59 +03:00
commit 619aa34a15
30 changed files with 1079 additions and 984 deletions

View File

@ -65,7 +65,13 @@ public class RunSQLSparkJob {
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) { for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
log.info("executing: {}", statement); log.info("executing: {}", statement);
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
spark.sql(statement).show(); try {
spark.sql(statement).show();
} catch (Exception e) {
log.error("Error executing statement: {}", statement, e);
System.err.println("Error executing statement: " + statement + "\n" + e);
throw e;
}
log log
.info( .info(
"executed in {}", "executed in {}",

View File

@ -0,0 +1,18 @@
# Install the whole "dnet-hadoop" project.
# NOTE: this script uses relative paths ("target/", "../../"), so it must be started from this module's directory.
# Delete this module's previous build-files in order to avoid any conflicts.
# Abort if the cleanup fails, so the build never runs on top of stale files.
rm -rf target/ || exit 1
# Go to the root directory of this project.
# Abort if the directory change fails, otherwise maven would be started in the wrong directory.
cd ../../ || exit 2
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Install the project.
# ${CHOSEN_MAVEN_PROFILE} is left unquoted on purpose: the empty default profile must expand to no argument at all.
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
# We skip tests for all modules, since they take a big amount of time and some of them fail.
# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script.

View File

@ -0,0 +1,20 @@
# Deploy and run the oozie workflow on the cluster which is defined in the "~/.dhp/application.properties" file.
# Available maven build profiles; the chosen one is passed to maven below.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
DEFAULT_PROFILE='' # It's the empty profile.
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Log file which is printed after the maven run.
RUN_LOG_FILE=./target/extract-and-run-on-remote-host.log
# Package this module, then deploy and run it through the "oozie-package", "deploy" and "run" profiles.
# ${CHOSEN_MAVEN_PROFILE} stays unquoted, so that the empty default profile expands to no argument at all.
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} \
    -Poozie-package,deploy,run \
    -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
# Print the log; the Oozie-job-ID is expected to be found in it.
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
cat "${RUN_LOG_FILE}"
# Follow-up commands, to be run manually:
# Check oozie workflow status
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
# Get the <job-ID> from the previous output and check the logs:
# yarn logs -applicationId application_<job-ID>

View File

@ -1,8 +1,10 @@
set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------------------------- --------------------------------------------------------------
-------------------------------------------------------------- --------------------------------------------------------------
-- Stats database creation -- Stats database creation
-------------------------------------------------------------- --------------------------------------------------------------
-------------------------------------------------------------- --------------------------------------------------------------
DROP database IF EXISTS ${stats_db_name} CASCADE; DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
CREATE database ${stats_db_name}; CREATE database ${stats_db_name}; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture) -- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
@ -5,27 +7,27 @@
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
SELECT * SELECT *
FROM ${external_stats_db_name}.fundref; FROM ${external_stats_db_name}.fundref; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.country AS CREATE OR REPLACE VIEW ${stats_db_name}.country AS
SELECT * SELECT *
FROM ${external_stats_db_name}.country; FROM ${external_stats_db_name}.country; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
SELECT * SELECT *
FROM ${external_stats_db_name}.countrygdp; FROM ${external_stats_db_name}.countrygdp; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
SELECT * SELECT *
FROM ${external_stats_db_name}.roarmap; FROM ${external_stats_db_name}.roarmap; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT * SELECT *
FROM ${external_stats_db_name}.rndexpediture; FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
SELECT * SELECT *
FROM ${external_stats_db_name}.licenses_normalized; FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
@ -33,23 +35,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
create or replace view ${stats_db_name}.usage_stats as create or replace view ${stats_db_name}.usage_stats as
select * from openaire_prod_usage_stats.usage_stats; select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
create or replace view ${stats_db_name}.downloads_stats as create or replace view ${stats_db_name}.downloads_stats as
select * from openaire_prod_usage_stats.downloads_stats; select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
create or replace view ${stats_db_name}.pageviews_stats as create or replace view ${stats_db_name}.pageviews_stats as
select * from openaire_prod_usage_stats.pageviews_stats; select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
create or replace view ${stats_db_name}.views_stats as create or replace view ${stats_db_name}.views_stats as
select * from openaire_prod_usage_stats.views_stats; select * from openaire_prod_usage_stats.views_stats; /*EOS*/
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
-- Creation date of the database -- Creation date of the database
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
create table ${stats_db_name}.creation_date STORED AS PARQUET as create table ${stats_db_name}.creation_date STORED AS PARQUET as
select date_format(current_date(), 'dd-MM-yyyy') as date; select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/

View File

@ -1,110 +1,11 @@
set mapred.job.queue.name=analytics; /*EOS*/
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
-- Post processing - Updates on main tables -- Post processing - Updates on main tables
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp
SET harvested='true'
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
FROM ${stats_db_name}.datasource_tmp d,
${stats_db_name}.result_datasources rd
WHERE d.id = rd.datasource);
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp
SET haspubs='yes'
WHERE project_tmp.id IN (SELECT pr.id
FROM ${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pr.result = r.id
AND r.type = 'publication');
DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id,
p.acronym,
p.title,
p.funder,
p.funding_lvl0,
p.funding_lvl1,
p.funding_lvl2,
p.ec39,
p.type,
p.startdate,
p.enddate,
p.start_year,
p.end_year,
p.duration,
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
p.callidentifier,
p.code,
p.totalcost,
p.fundedamount,
p.currency
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
WHERE r.type = 'publication'
GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id,
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
count(distinct r.id) AS dp
FROM ${stats_db_name}.project_tmp pp,
${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pp.id = pr.id
AND pr.result = r.id
AND r.type = 'publication'
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
ON prr2.id = p.id;
UPDATE ${stats_db_name}.publication_tmp
SET delayed = 'yes'
WHERE publication_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.dataset_tmp
SET delayed = 'yes'
WHERE dataset_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.software_tmp
SET delayed = 'yes'
WHERE software_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.otherresearchproduct_tmp
SET delayed = 'yes'
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
SELECT result_projects.id AS result, SELECT result_projects.id AS result,
result_projects.project AS project_results, result_projects.project AS project_results,
@ -116,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project ${stats_db_name}.project
WHERE result_projects.id = result.id WHERE result_projects.id = result.id
AND result.type = 'publication' AND result.type = 'publication'
AND project.id = result_projects.project; AND project.id = result_projects.project; /*EOS*/

View File

@ -1,42 +1,4 @@
------------------------------------------------------------------------------------------------------ set mapred.job.queue.name=analytics; /*EOS*/
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
SELECT *
FROM ${stats_db_name}.datasource_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
SELECT *
FROM ${stats_db_name}.publication_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
SELECT *
FROM ${stats_db_name}.dataset_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.software purge;
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
SELECT *
FROM ${stats_db_name}.software_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
SELECT *
FROM ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
---------------------------------------------- ----------------------------------------------
-- Re-creating views from final parquet tables -- Re-creating views from final parquet tables
@ -54,4 +16,4 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset FROM ${stats_db_name}.dataset
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct; FROM ${stats_db_name}.otherresearchproduct; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Additional relations -- Additional relations
@ -5,10 +7,10 @@
-- Sources related tables/views -- Sources related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -16,12 +18,12 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -29,12 +31,12 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -42,12 +44,12 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -55,7 +57,7 @@ LEFT OUTER JOIN
( (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources SELECT * FROM ${stats_db_name}.publication_sources
@ -64,24 +66,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.software_sources SELECT * FROM ${stats_db_name}.software_sources
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
from ( from (
SELECT substr(res.id, 4) as id, auth_pid.value as orcid SELECT substr(res.id, 4) as id, auth_pid.value as orcid
FROM ${openaire_db_name}.result res FROM ${openaire_db_name}.result res
LATERAL VIEW explode(author) a as auth LATERAL VIEW explode(author) a as auth
LATERAL VIEW explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -91,12 +93,12 @@ where reltype='resultResult'
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -108,12 +110,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(target, 4); group by substr(target, 4); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
select substr(source, 4) as id, count(distinct substr(target, 4)) as references select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -125,4 +127,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(source, 4); group by substr(source, 4); /*EOS*/

View File

@ -1,4 +1,5 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Additional relations -- Additional relations
@ -6,33 +7,33 @@ set mapred.job.queue.name=analytics;
-- Licences related tables/views -- Licences related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses SELECT * FROM ${stats_db_name}.publication_licenses
@ -41,29 +42,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
-- organization_sources: one row per (organization, collectedfrom entry), naming the datasource
-- the organization was collected from.
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
-- A datasource key with no match in the filtered datasource list below is reported as the literal other.
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM ( FROM (
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
-- LEFT OUTER JOIN keeps every organization row even when its datasource is filtered out on the right side.
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
-- Only datasources that are neither deleted by inference nor invisible count as a match.
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
-- result_accessroute: distinct (result id, open access route) pairs.
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
-- NOTE(review): instance looks like an array here, so instance.accessright.openaccessroute would be
-- the array of per-instance routes that explode() flattens - confirm against the table schema.
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
-- Results that are deleted by inference or invisible are excluded.
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/

View File

@ -1,4 +1,4 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics;
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -18,15 +18,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct * select /*+ COALESCE(100) */ distinct *
from ( from (
select peer_reviewed.* from peer_reviewed select peer_reviewed.* from peer_reviewed
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -36,15 +36,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct * select /*+ COALESCE(100) */ distinct *
from ( from (
select peer_reviewed.* from peer_reviewed select peer_reviewed.* from peer_reviewed
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -54,15 +54,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct * select /*+ COALESCE(100) */ distinct *
from ( from (
select peer_reviewed.* from peer_reviewed select peer_reviewed.* from peer_reviewed
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as ( with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -72,13 +72,13 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct * select /*+ COALESCE(100) */ distinct *
from ( from (
select peer_reviewed.* from peer_reviewed select peer_reviewed.* from peer_reviewed
union all union all
select non_peer_reviewed.* from non_peer_reviewed select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; where peer_reviewed.id is null) pr; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed select * from ${stats_db_name}.publication_refereed
@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
union all union all
select * from ${stats_db_name}.software_refereed select * from ${stats_db_name}.software_refereed
union all union all
select * from ${stats_db_name}.otherresearchproduct_refereed; select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
-- indi_impact_measures: one row per (result, measure) for every measure except views and downloads.
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
-- unit.value[0] is exposed twice: as a double (score) and as decimal(6,3) (score_dec).
-- unit.value[1] is exposed as impact_class.
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
-- explode(measures) yields one row per measure attached to the result.
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads'; where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
-- result_apc_affiliations: distinct (result, organization) affiliation relations that carry properties,
-- with the organization legal name and the APC amount and currency read from those properties.
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
-- The relation target is exposed as the result id and the relation source as the organization id.
select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
-- properties[0] is read as the amount and properties[1] as the currency.
-- NOTE(review): this relies on the order of the properties array - confirm it is guaranteed upstream.
cast(rel.properties[0].value as double) apc_amount, cast(rel.properties[0].value as double) apc_amount,
rel.properties[1].value apc_currency rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.organization o on o.id=rel.source
-- No column of r is selected, so this inner join only drops relations whose target is not in the result table.
join ${openaire_db_name}.result r on r.id=rel.target join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/

View File

@ -1,27 +1,27 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------- -------------------------------------------
--- Extra tables, mostly used by indicators --- Extra tables, mostly used by indicators
-- result_projectcount: number of distinct projects linked to each result.
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
-- count(distinct p.id) ignores NULLs, so a result without any project gets a count of 0.
select r.id, count(distinct p.id) as count select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
-- Both joins are left outer joins so that every row of result stays in the output.
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id; /*EOS*/
-- result_fundercount: number of distinct funders reached through the projects linked to each result.
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
-- count(distinct p.funder) ignores NULLs, so a result without any funded project gets a count of 0.
select r.id, count(distinct p.funder) as count select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
-- Same join shape as result_projectcount: left outer joins keep every row of result.
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as ( with rcount as (
@ -30,39 +30,39 @@ with rcount as (
left outer join ${stats_db_name}.result_projects rp on rp.project=p.id left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
left outer join ${stats_db_name}.result r on r.id=rp.id left outer join ${stats_db_name}.result r on r.id=rp.id
group by r.type, p.id ) group by r.type, p.id )
select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets, sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='software' then rcount.count else 0 end) as software,
sum(case when rcount.type='other' then rcount.count else 0 end) as other sum(case when rcount.type='other' then rcount.count else 0 end) as other
from rcount from rcount
group by rcount.pid; group by rcount.pid; /*EOS*/
-- Pass-through views that expose tables of the stats_ext database under ${stats_db_name}.
-- NOTE(review): stats_ext is hard-coded here, while the issn_gold_oa_dataset view in this script uses
-- ${external_stats_db_name} - confirm whether these views should use the parameter as well.
-- NOTE(review): the first view is named rndexpenditure but reads stats_ext.rndexpediture (spelled
-- without the n) - confirm that this is the real name of the source table.
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
-- result_instance: distinct flattened rows, one per (result, instance, instance pid), restricted to
-- results that exist in ${stats_db_name}.result.
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
create table if not exists ${stats_db_name}.result_instance stored as parquet as create table if not exists ${stats_db_name}.result_instance stored as parquet as
-- Only the columns of the flattened subquery r are kept. The join with res acts purely as a filter.
select distinct r.* select /*+ COALESCE(100) */ distinct r.*
from ( from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
-- The second lateral view is OUTER, so an instance with an empty or NULL pid array still produces
-- one row, with pidtype and pid set to NULL.
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id; join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
-- result_apc: distinct (result, processing charge amount, currency) rows taken from the result instances,
-- restricted to results that exist in ${stats_db_name}.result.
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select distinct r.id, r.amount, r.currency select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
from ( from (
-- NOTE(review): the amount is cast to float here, whereas result_apc_affiliations casts its amount
-- to double - confirm that the difference in precision is intended.
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id join ${stats_db_name}.result res on res.id=r.id
-- Instances whose amount is NULL after the cast are dropped.
where r.amount is not null; where r.amount is not null; /*EOS*/
-- Pass-through view over issn_gold_oa_dataset in the external stats database, whose name is supplied
-- through the ${external_stats_db_name} parameter.
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/

View File

@ -1,7 +1,7 @@
-- Sprint 1 ---- -- Sprint 1 ----
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as green_oa select p.id, 1 as green_oa
@ -12,7 +12,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as grey_lit select p.id, 1 as grey_lit
@ -23,7 +23,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
@ -33,7 +33,7 @@ left outer join (
-- Sprint 2 ---- -- Sprint 2 ----
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select r.id, license.type as lic from ${stats_db_name}.result r select r.id, license.type as lic from ${stats_db_name}.result r
@ -42,7 +42,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select r.id, lower(parse_url(license.type, "HOST")) as lic_host select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@ -52,12 +52,12 @@ left outer join (
-- indi_pub_has_abstract: one distinct row per publication with an integer has_abstract flag.
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
-- NOTE(review): coalesce(abstract, true) replaces a NULL abstract with true, so publications whose
-- abstract column is NULL end up with has_abstract = 1 - confirm that this default is intended.
select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
from ${stats_db_name}.publication; /*EOS*/ from ${stats_db_name}.publication; /*EOS*/
-- indi_result_with_orcid: has_orcid is 1 when the result id appears in result_orcid and 0 otherwise.
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
-- coalesce turns the NULL produced by an unmatched left outer join into 0.
-- distinct collapses the duplicates created when a result has several rows in result_orcid.
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/ select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
@ -66,7 +66,7 @@ left outer join (
---- Sprint 3 ---- ---- Sprint 3 ----
-- indi_funded_result_with_fundref: for every result listed in project_results, fundref is 1 when at
-- least one of its project_results rows has provenance Harvested and 0 otherwise.
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
-- coalesce turns the NULL produced by an unmatched left outer join into 0.
select distinct r.result as id, coalesce(fundref, 0) as fundref select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
from ${stats_db_name}.project_results r from ${stats_db_name}.project_results r
left outer join ( left outer join (
-- The subquery is already distinct per result, so the join itself does not multiply rows.
select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/ select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
SELECT ro.organization organization, ro.id, o.name SELECT ro.organization organization, ro.id, o.name
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null) join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/ group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization join ${stats_db_name}.organization o on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null) where country <> 'UNKNOWN' and o.name is not null)
select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id from tmp as o1 join tmp as o2 on o1.id=o2.id
where o1.id=o2.id and o1.country!=o2.country where o1.id=o2.id and o1.country!=o2.country
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/ group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
select o.id organization, o.name, ro.project as project select o.id organization, o.name, ro.project as project
from ${stats_db_name}.organization o from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null) join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.name<>o2.name where o1.organization<>o2.organization and o1.name<>o2.name
@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
select o.id organization, o.name, o.country , ro.project as project select o.id organization, o.name, o.country , ro.project as project
from ${stats_db_name}.organization o from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null) join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country where o1.organization<>o2.organization and o1.country<>o2.country
@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
join ${stats_db_name}.organization o on o.id=op.id join ${stats_db_name}.organization o on o.id=op.id
join ${stats_db_name}.project p on p.id=op.project join ${stats_db_name}.project p on p.id=op.project
where country <> 'UNKNOWN') where country <> 'UNKNOWN')
select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1 from tmp as f1
join tmp as f2 on f1.project=f2.project join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country where f1.country<>f2.country
@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
select distinct country, ro.id as result from ${stats_db_name}.organization o select distinct country, ro.id as result from ${stats_db_name}.organization o
join ${stats_db_name}.result_organization ro on o.id=ro.organization join ${stats_db_name}.result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null) where country <> 'UNKNOWN' and o.name is not null)
select o1.country country1, o2.country country2, count(o1.result) as collaborations select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country where o1.country<>o2.country
@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
---- Sprint 4 ---- ---- Sprint 4 ----
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from ${stats_db_name}.publication_datasources pd from ${stats_db_name}.publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as in_diamond_journal select pd.id, 1 as in_diamond_journal
@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from ${stats_db_name}.publication pd from ${stats_db_name}.publication pd
left outer join ( left outer join (
select pd.id, 1 as is_transformative select pd.id, 1 as is_transformative
@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
from ${stats_db_name}.result_instance ri from ${stats_db_name}.result_instance ri
left outer join ( left outer join (
select ri.id, 1 as pub_closed_other_open select ri.id, 1 as pub_closed_other_open
@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
---- Sprint 5 ---- ---- Sprint 5 ----
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies select /*+ COALESCE(100) */ id, count(id) as number_of_copies
from ${stats_db_name}.result_instance from ${stats_db_name}.result_instance
group by id; /*EOS*/ group by id; /*EOS*/
---- Sprint 6 ---- ---- Sprint 6 ----
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
SELECT result_id, sum(downloads) no_downloads SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id join ${stats_db_name}.publication on result_id=id
where downloads>0 where downloads>0
@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
SELECT result_id, repository_id, sum(downloads) no_downloads SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id join ${stats_db_name}.publication on result_id=id
where downloads>0 where downloads>0
@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id where downloads>0 join ${stats_db_name}.publication on result_id=id where downloads>0
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/ GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id join ${stats_db_name}.publication on result_id=id
where downloads>0 where downloads>0
@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
UNION ALL UNION ALL
select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
) )
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM ${stats_db_name}.publication pd FROM ${stats_db_name}.publication pd
left outer join ( left outer join (
select pd.id, 1 as is_gold select pd.id, 1 as is_gold
@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
FROM ${stats_db_name}.datasource FROM ${stats_db_name}.datasource
WHERE issn_online IS NOT NULL ) as issn WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7) WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM ${stats_db_name}.publication_datasources pd FROM ${stats_db_name}.publication_datasources pd
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
select distinct p.id, coalesce(is_hybrid, 0) is_hybrid select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as is_hybrid select p.id, 1 as is_hybrid
@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
where cast(year as int)>2003 where cast(year as int)>2003
group by ro.organization) group by ro.organization)
--return results_fair/all_results --return results_fair/all_results
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization; /*EOS*/ join result_fair on result_fair.organization=allresults.organization; /*EOS*/
@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar from allresults ar
join result_fair rf on rf.organization=ar.organization; /*EOS*/ join result_fair rf on rf.organization=ar.organization; /*EOS*/
@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/ join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar join result_fair rf from allresults ar join result_fair rf
on rf.organization=ar.organization; /*EOS*/ on rf.organization=ar.organization; /*EOS*/
@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/ join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/ join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults from allresults
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/ join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
select allpubsshare.organization, select /*+ COALESCE(100) */ allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare org_openess FROM allpubsshare
@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
select cast(allpubsshare.year as int) year, allpubsshare.organization, select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare org_openess FROM allpubsshare
@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
select distinct p.id, coalesce(has_preprint, 0) as has_preprint select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
from ${stats_db_name}.publication_classifications p from ${stats_db_name}.publication_classifications p
left outer join ( left outer join (
select p.id, 1 as has_preprint select p.id, 1 as has_preprint
@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
select distinct p.id, coalesce(is_subscription, 0) as is_subscription select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join( left outer join(
select p.id, 1 as is_subscription from ${stats_db_name}.publication p select p.id, 1 as is_subscription from ${stats_db_name}.publication p
@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from ${stats_db_name}.result p from ${stats_db_name}.result p
left outer join ( left outer join (
select p.id, 1 as result_with_pid select p.id, 1 as result_with_pid
@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
select distinct p.id as id, coalesce(is_interdisciplinary, 0) select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
as is_interdisciplinary as is_interdisciplinary
from pub_fos_totals p from pub_fos_totals p
left outer join ( left outer join (
@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as is_bronze_oa select p.id, 1 as is_bronze_oa
@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
select pry.project_id, pry.acronym, pry.result_id, select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
coalesce(is_project_result_after, 0) as is_project_result_after coalesce(is_project_result_after, 0) as is_project_result_after
from project_year_result_year pry from project_year_result_year pry
left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
from ${stats_db_name}.funder f from ${stats_db_name}.funder f
left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
join ${stats_db_name}.project p on p.id=rp.project join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003 where cast(year as int)>2003
group by p.funder) group by p.funder)
select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
from allresults from allresults
join result_fair on result_fair.funder=allresults.funder; /*EOS*/ join result_fair on result_fair.funder=allresults.funder; /*EOS*/
@ -745,7 +745,7 @@ allresults as
join ${stats_db_name}.result r on r.id=rc.id join ${stats_db_name}.result r on r.id=rc.id
where cast(year as int)>2003 where cast(year as int)>2003
group by rc.ri_initiative) group by rc.ri_initiative)
select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
from allresults from allresults
join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/ join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/
@ -817,16 +817,14 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
select allpubsshare.funder, select /*+ COALESCE(100) */ allpubsshare.funder,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end)) funder_openess
funder_openess FROM allpubsshare FROM allpubsshare
left outer join (select funder,d from left outer join (select funder,d from alldatasetssshare) tmp1
alldatasetssshare) tmp1 on tmp1.funder=allpubsshare.funder
on tmp1.funder=allpubsshare.funder left outer join (select funder,s from allsoftwaresshare) tmp2
left outer join (select funder,s from on tmp2.funder=allpubsshare.funder; /*EOS*/
allsoftwaresshare) tmp2
on tmp2.funder=allpubsshare.funder; /*EOS*/
DROP VIEW pubs_oa; /*EOS*/ DROP VIEW pubs_oa; /*EOS*/
DROP VIEW datasets_oa; /*EOS*/ DROP VIEW datasets_oa; /*EOS*/
@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
select allpubsshare.ri_initiative, select /*+ COALESCE(100) */ allpubsshare.ri_initiative,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
ris_openess FROM allpubsshare ris_openess FROM allpubsshare
@ -943,7 +941,7 @@ with result_findable as
join ${stats_db_name}.project p on p.id=rp.project join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003 where cast(year as int)>2003
group by p.funder) group by p.funder)
select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
from allresults from allresults
join result_findable on result_findable.funder=allresults.funder; /*EOS*/ join result_findable on result_findable.funder=allresults.funder; /*EOS*/
@ -952,41 +950,43 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as
with result_contexts as with result_contexts as
(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc (select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
join ${stats_db_name}.concept on concept.id=rc.concept join ${stats_db_name}.concept on concept.id=rc.concept
join ${stats_db_name}.category on category.id=concept.category join ${stats_db_name}.category on category.id=concept.category
join ${stats_db_name}.context on context.id=category.context), join ${stats_db_name}.context on context.id=category.context),
result_findable as result_findable as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id join ${stats_db_name}.result r on r.id=rc.id
join ${stats_db_name}.result_pids rp on rp.id=r.id join ${stats_db_name}.result_pids rp on rp.id=r.id
where cast(r.year as int)>2003 where cast(r.year as int)>2003
group by rc.ri_initiative), group by rc.ri_initiative),
allresults as allresults as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id join ${stats_db_name}.result r on r.id=rc.id
where cast(r.year as int)>2003 where cast(r.year as int)>2003
group by rc.ri_initiative) group by rc.ri_initiative)
select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
from allresults from allresults
join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/ join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
with org_names_pids as with org_names_pids as
(select org.id,name, pid from ${stats_db_name}.organization org (select org.id,name, pid from ${stats_db_name}.organization org
join ${stats_db_name}.organization_pids op on org.id=op.id), join ${stats_db_name}.organization_pids op on org.id=op.id),
publicly_funded_orgs as publicly_funded_orgs as
(select distinct name from (select distinct name from
(select pf.name from stats_ext.insitutions_for_publicly_funded pf (select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
union all union all
select pf.name from stats_ext.insitutions_for_publicly_funded pf select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.project p on p.funder=pf.name join ${stats_db_name}.project p on p.funder=pf.name
union all union all
select op.name from stats_ext.insitutions_for_publicly_funded pf select op.name from stats_ext.insitutions_for_publicly_funded pf
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
and pf.publicly_funded='yes') foo) and pf.publicly_funded='yes') foo)
select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
select distinct p.id, coalesce(green_with_license, 0) as green_with_license select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
@ -1006,7 +1006,7 @@ left outer join (
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
create table ${stats_db_name}.result_country stored as parquet as create table ${stats_db_name}.result_country stored as parquet as
select distinct id, country select /*+ COALESCE(100) */ distinct id, country
from ( from (
select ro.id, o.country select ro.id, o.country
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
select distinct r.id, coalesce(oa_with_license,0) as oa_with_license select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open
drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as
with without_license as with without_license as
(select distinct id from ${stats_db_name}.indi_result_oa_with_license (select distinct id from ${stats_db_name}.indi_result_oa_with_license
where oa_with_license=0) where oa_with_license=0)
select distinct r.id, coalesce(oa_without_license,0) as oa_without_license select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_without_license left outer join (select distinct r.id, 1 as oa_without_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
create table ${stats_db_name}.indi_result_under_transformative stored as parquet as create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
with transformative_dois as ( with transformative_dois as (
select distinct doi from stats_ext.transformative_facts) select distinct doi from stats_ext.transformative_facts)
select distinct r.id, coalesce(under_transformative,0) as under_transformative select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select distinct rp.id, 1 as under_transformative select distinct rp.id, 1 as under_transformative

View File

@ -1,30 +1,30 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
---------------------------------------------------- ----------------------------------------------------
-- Shortcuts for various definitions in stats db --- -- Shortcuts for various definitions in stats db ---
---------------------------------------------------- ----------------------------------------------------
-- Peer reviewed: -- Peer reviewed:
drop table if exists ${stats_db_name}.result_peerreviewed purge; drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
-- Green OA: -- Green OA:
drop table if exists ${stats_db_name}.result_greenoa purge; drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
select r.id, case when green.green_oa=1 then true else false end as green select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
-- GOLD OA: -- GOLD OA:
drop table if exists ${stats_db_name}.result_gold purge; drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
select r.id, case when gold.is_gold=1 then true else false end as gold select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/

View File

@ -1,58 +1,26 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
-- peer reviewed) -- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp;
CREATE TABLE ${stats_db_name}.result_tmp ( drop view if exists ${stats_db_name}.result; /*EOS*/
id STRING, drop table if exists ${stats_db_name}.result; /*EOS*/
title STRING,
publisher STRING,
journal STRING,
`date` STRING,
`year` INT,
bestlicence STRING,
access_mode STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING ,
peer_reviewed BOOLEAN,
green BOOLEAN,
gold BOOLEAN)
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
insert into ${stats_db_name}.result_tmp CREATE TABLE ${stats_db_name}.result stored as parquet as
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.publication r FROM (
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
FROM ${stats_db_name}.publication)
UNION ALL
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
FROM ${stats_db_name}.dataset)
UNION ALL
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
FROM ${stats_db_name}.software)
UNION ALL
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
FROM ${stats_db_name}.otherresearchproduct)
) r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.dataset r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.software r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.otherresearchproduct r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
drop table ${stats_db_name}.result_tmp;

View File

@ -1,4 +1,4 @@
set mapred.job.queue.name=analytics; set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------------------------- --------------------------------------------------------------
-------------------------------------------------------------- --------------------------------------------------------------
@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics;
-------------------------------------------------------------- --------------------------------------------------------------
-- Publication temporary table -- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_tmp
(
id STRING,
title STRING,
publisher STRING,
journal STRING,
date STRING,
year STRING,
bestlicence STRING,
embargo_end_date STRING,
delayed BOOLEAN,
authors INT,
source STRING,
abstract BOOLEAN,
type STRING
)
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.publication_tmp CREATE TABLE ${stats_db_name}.publication stored as parquet as
SELECT substr(p.id, 4) as id, with pub_pr as (
p.title[0].value as title, select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
p.publisher.value as publisher, from ${openaire_db_name}.publication pub
p.journal.name as journal, join ${openaire_db_name}.relation rel
p.dateofacceptance.value as date, on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
date_format(p.dateofacceptance.value, 'yyyy') as year, and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
p.bestaccessright.classname as bestlicence, join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
p.embargoenddate.value as embargo_end_date, where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
false as delayed, ),
size(p.author) as authors, pub_delayed as (
concat_ws('\u003B', p.source.value) as source, select pub_id, max(delayed) as delayed
case when size(p.description) > 0 then true else false end as abstract, from pub_pr
'publication' as type group by pub_id
from ${openaire_db_name}.publication p )
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; select /*+ COALESCE(100) */
substr(pub.id, 4) as id,
pub.title[0].value as title,
pub.publisher.value as publisher,
pub.journal.name as journal,
pub.dateofacceptance.value as date,
date_format(pub.dateofacceptance.value, 'yyyy') as year,
pub.bestaccessright.classname as bestlicence,
pub.embargoenddate.value as embargo_end_date,
coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
size(pub.author) as authors,
concat_ws('\u003B', pub.source.value) as source,
case when size(pub.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication pub
left outer join pub_delayed on pub.id=pub_delayed.pub_id
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, instancetype.classname as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
@ -73,44 +73,44 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
select substr(p.id, 4) as id, p.language.classname as language select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context; create view if not exists TARGET.context as select * from SOURCE.context;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
drop database if exists TARGET cascade; drop database if exists TARGET cascade;
create database if not exists TARGET; create database if not exists TARGET;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
drop database if exists TARGET cascade; drop database if exists TARGET cascade;
create database if not exists TARGET; create database if not exists TARGET;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
drop database if exists TARGET cascade; drop database if exists TARGET cascade;
create database if not exists TARGET; create database if not exists TARGET;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
drop database if exists TARGET cascade; drop database if exists TARGET cascade;
create database if not exists TARGET; create database if not exists TARGET;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
drop database if exists TARGET cascade; drop database if exists TARGET cascade;
create database if not exists TARGET; create database if not exists TARGET;

View File

@ -1,15 +1,17 @@
set mapred.job.queue.name=analytics; /*EOS*/
create table ${observatory_db_name}.result_cc_licence stored as parquet as create table ${observatory_db_name}.result_cc_licence stored as parquet as
select r.id, coalesce(rln.count, 0) > 0 as cc_licence select /*+ COALESCE(100) */ r.id, coalesce(rln.count, 0) > 0 as cc_licence
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
from ${stats_db_name}.result_licenses rl from ${stats_db_name}.result_licenses rl
group by rl.id group by rl.id
) rln on rln.id=r.id; ) rln on rln.id=r.id; /*EOS*/
create table ${observatory_db_name}.result_affiliated_country stored as parquet as create table ${observatory_db_name}.result_affiliated_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -35,11 +37,11 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year stored as parquet as create table ${observatory_db_name}.result_affiliated_year stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -65,11 +67,11 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -95,11 +97,11 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -127,10 +129,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -158,10 +160,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -187,10 +189,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -216,10 +218,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -247,10 +249,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -278,10 +280,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_country stored as parquet as create table ${observatory_db_name}.result_deposited_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -309,10 +311,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_year stored as parquet as create table ${observatory_db_name}.result_deposited_year stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -340,11 +342,11 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -372,10 +374,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -403,10 +405,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -434,10 +436,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization stored as parquet as create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -465,10 +467,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -496,10 +498,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder stored as parquet as create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -529,10 +531,10 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select select /*+ COALESCE(100) */
count(distinct r.id) as total, count(distinct r.id) as total,
r.green, r.green,
r.gold, r.gold,
@ -562,4 +564,4 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Dataset table/view and Dataset related tables/views -- Dataset table/view and Dataset related tables/views
@ -5,75 +7,74 @@
------------------------------------------------------ ------------------------------------------------------
-- Dataset temporary table supporting updates -- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_tmp CREATE TABLE ${stats_db_name}.dataset stored as parquet as
( with datast_pr as (
id STRING, select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
title STRING, from ${openaire_db_name}.dataset datast
publisher STRING, join ${openaire_db_name}.relation rel
journal STRING, on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id
date STRING, and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
year STRING, join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
bestlicence STRING, where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false
embargo_end_date STRING, ),
delayed BOOLEAN, datast_delayed as (
authors INT, select datast_id, max(delayed) as delayed
source STRING, from datast_pr
abstract BOOLEAN, group by datast_id
type STRING
) )
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); select /*+ COALESCE(100) */
substr(datast.id, 4) as id,
datast.title[0].value as title,
datast.publisher.value as publisher,
cast(null as string) as journal,
datast.dateofacceptance.value as date,
date_format(datast.dateofacceptance.value, 'yyyy') as year,
datast.bestaccessright.classname as bestlicence,
datast.embargoenddate.value as embargo_end_date,
coalesce(datast_delayed.delayed, false) as delayed, -- It's delayed, when the dataset was published after the end of the project.
size(datast.author) as authors,
concat_ws('\u003B', datast.source.value) as source,
case when size(datast.description) > 0 then true else false end as abstract,
'dataset' as type
from ${openaire_db_name}.dataset datast
left outer join datast_delayed on datast.id=datast_delayed.datast_id
where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/
INSERT INTO ${stats_db_name}.dataset_tmp
SELECT substr(d.id, 4) AS id,
d.title[0].value AS title,
d.publisher.value AS publisher,
cast(null AS string) AS journal,
d.dateofacceptance.value as date,
date_format(d.dateofacceptance.value, 'yyyy') AS year,
d.bestaccessright.classname AS bestlicence,
d.embargoenddate.value AS embargo_end_date,
false AS delayed,
size(d.author) AS authors,
concat_ws('\u003B', d.source.value) AS source,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
@ -82,35 +83,35 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------------------- --------------------------------------------------------
-------------------------------------------------------- --------------------------------------------------------
-- Software table/view and Software related tables/views -- Software table/view and Software related tables/views
@ -5,72 +7,74 @@
-------------------------------------------------------- --------------------------------------------------------
-- Software temporary table supporting updates -- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_tmp
( CREATE TABLE ${stats_db_name}.software stored as parquet as
id STRING, with soft_pr as (
title STRING, select soft.id as soft_id, case when (to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
publisher STRING, from ${openaire_db_name}.software soft
journal STRING, join ${openaire_db_name}.relation rel
date STRING, on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id
year STRING, and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
bestlicence STRING, join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
embargo_end_date STRING, where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false
delayed BOOLEAN, ),
authors INT, soft_delayed as (
source STRING, select soft_id, max(delayed) as delayed
abstract BOOLEAN, from soft_pr
type STRING group by soft_id
) )
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); select /*+ COALESCE(100) */
substr(soft.id, 4) as id,
soft.title[0].value as title,
soft.publisher.value as publisher,
cast(null as string) as journal,
soft.dateofacceptance.value as date,
date_format(soft.dateofacceptance.value, 'yyyy') as year,
soft.bestaccessright.classname as bestlicence,
soft.embargoenddate.value as embargo_end_date,
coalesce(soft_delayed.delayed, false) as delayed, -- It's delayed, when the software was published after the end of the project.
size(soft.author) as authors,
concat_ws('\u003B', soft.source.value) as source,
case when size(soft.description) > 0 then true else false end as abstract,
'software' as type
from ${openaire_db_name}.software soft
left outer join soft_delayed on soft.id=soft_delayed.soft_id
where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/
INSERT INTO ${stats_db_name}.software_tmp
SELECT substr(s.id, 4) as id,
s.title[0].value AS title,
s.publisher.value AS publisher,
CAST(NULL AS string) AS journal,
s.dateofacceptance.value AS DATE,
date_format(s.dateofacceptance.value, 'yyyy') AS YEAR,
s.bestaccessright.classname AS bestlicence,
s.embargoenddate.value AS embargo_end_date,
FALSE AS delayed,
SIZE(s.author) AS authors,
concat_ws('\u003B', s.source.value) AS source,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
@ -79,35 +83,35 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
select substr(p.id, 4) AS id, p.language.classname AS language select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Otherresearchproduct table/view and Otherresearchproduct related tables/views -- Otherresearchproduct table/view and Otherresearchproduct related tables/views
@ -5,101 +7,103 @@
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates -- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as
( with other_pr as (
id STRING, select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
title STRING, from ${openaire_db_name}.otherresearchproduct other
publisher STRING, join ${openaire_db_name}.relation rel
journal STRING, on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id
date STRING, and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
year STRING, join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
bestlicence STRING, where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false
embargo_end_date STRING, ),
delayed BOOLEAN, other_delayed as (
authors INT, select other_id, max(delayed) as delayed
source STRING, from other_pr
abstract BOOLEAN, group by other_id
type STRING )
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); select /*+ COALESCE(100) */
substr(other.id, 4) as id,
other.title[0].value as title,
other.publisher.value as publisher,
cast(null as string) as journal,
other.dateofacceptance.value as date,
date_format(other.dateofacceptance.value, 'yyyy') as year,
other.bestaccessright.classname as bestlicence,
other.embargoenddate.value as embargo_end_date,
-- delayed is taken from the other_delayed CTE: true when the product was accepted after the end date
-- of at least one project that produced it. Products with no matching row in the CTE default to false.
coalesce(other_delayed.delayed, false) as delayed,
size(other.author) as authors,
concat_ws('\u003B', other.source.value) as source,
case when size(other.description) > 0 then true else false end as abstract,
'other' as type
from ${openaire_db_name}.otherresearchproduct other
left outer join other_delayed on other.id=other_delayed.other_id
where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
SELECT substr(o.id, 4) AS id,
o.title[0].value AS title,
o.publisher.value AS publisher,
CAST(NULL AS string) AS journal,
o.dateofacceptance.value AS DATE,
date_format(o.dateofacceptance.value, 'yyyy') AS year,
o.bestaccessright.classname AS bestlicence,
o.embargoenddate.value as embargo_end_date,
FALSE AS delayed,
SIZE(o.author) AS authors,
concat_ws('\u003B', o.source.value) AS source,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
-- Otherresearchproduct_citations -- Otherresearchproduct_citations
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN(SELECT substr(d.id, 4) id LEFT OUTER JOIN (SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -1,110 +1,120 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Project table/view and Project related tables/views -- Project table/view and Project related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' and r.source like '40|%' WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' and r.target like '40|%' WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/
create table ${stats_db_name}.project_classification STORED AS PARQUET as create table ${stats_db_name}.project_classification STORED AS PARQUET as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_tmp CREATE TABLE ${stats_db_name}.project stored as parquet as
( with pr_pub as (
id STRING, select pr.id as pr_id, pub.id as pub_id,
acronym STRING, (case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed,
title STRING, max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub
funder STRING, from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication
funding_lvl0 STRING, where datainfo.deletedbyinference = false and datainfo.invisible = false) pub
funding_lvl1 STRING, join ${openaire_db_name}.relation rel
funding_lvl2 STRING, on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id
ec39 STRING, and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
type STRING, join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project
startdate STRING, where datainfo.deletedbyinference = false and datainfo.invisible = false) pr
enddate STRING, on pr.id=rel.target
start_year INT, group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate
end_year INT, ),
duration INT, num_pubs_pr as (
haspubs STRING, select pr_id, count( distinct pub_id) as num_pubs
numpubs INT, from pr_pub
daysforlastpub INT, group by pr_id
delayedpubs INT, ),
callidentifier STRING, pub_delayed as (
code STRING, select pr_id, pub_id, max(delayed) as delayed
totalcost FLOAT, from pr_pub
fundedamount FLOAT, group by pr_id, pub_id
currency STRING ),
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); num_pub_delayed as (
select pr_id, count(distinct pub_id) as num_delayed
from pub_delayed
where delayed
group by pr_id
)
-- Final projection: one output row per project.
-- pr_pub holds one row per (project, publication) pair, so it is first collapsed to one row per project
-- (keeping the largest daysForlastPub) before being joined. Joining pr_pub directly would repeat every
-- project once per linked publication.
select /*+ COALESCE(100) */
substr(p.id, 4) as id,
p.acronym.value as acronym,
p.title.value as title,
xpath_string(p.fundingtree[0].value, '//funder/name') as funder,
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') as funding_lvl0,
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') as funding_lvl1,
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') as funding_lvl2,
p.ecsc39.value as ec39,
p.contracttype.classname as type,
p.startdate.value as startdate,
p.enddate.value as enddate,
year(p.startdate.value) as start_year,
year(p.enddate.value) as end_year,
cast(months_between(p.enddate.value, p.startdate.value) as int) as duration,
-- a project has publications exactly when it has a row in num_pubs_pr
case when num_pubs_pr.pr_id is null then 'no' else 'yes' end as haspubs,
num_pubs_pr.num_pubs as numpubs,
last_pub.daysForlastPub as daysForlastPub,
npd.num_delayed as delayedpubs,
p.callidentifier.value as callidentifier,
p.code.value as code,
p.totalcost as totalcost,
p.fundedamount as fundedamount,
p.currency.value as currency
from ${openaire_db_name}.project p
left outer join (select pr_id, max(daysForlastPub) as daysForlastPub from pr_pub group by pr_id) last_pub on last_pub.pr_id = p.id
left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id
left outer join num_pub_delayed npd on npd.pr_id=p.id
where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/
INSERT INTO ${stats_db_name}.project_tmp
SELECT substr(p.id, 4) AS id,
p.acronym.value AS acronym,
p.title.value AS title,
xpath_string(p.fundingtree[0].value, '//funder/name') AS funder,
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0,
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1,
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2,
p.ecsc39.value AS ec39,
p.contracttype.classname AS type,
p.startdate.value AS startdate,
p.enddate.value AS enddate,
year(p.startdate.value) AS start_year,
year(p.enddate.value) AS end_year,
CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration,
'no' AS haspubs,
0 AS numpubs,
0 AS daysforlastpub,
0 AS delayedpubs,
p.callidentifier.value AS callidentifier,
p.code.value AS code,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
p.currency.value AS currency
FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
create table ${stats_db_name}.funder STORED AS PARQUET as create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id, select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fund, '//funder/shortname') as shortname,
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
properties[0].value contribution, properties[1].value currency properties[0].value contribution, properties[1].value currency
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
LATERAL VIEW explode (r.properties) properties LATERAL VIEW explode (r.properties) properties
where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%'
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
---------------------------------------------------- ----------------------------------------------------
---------------------------------------------------- ----------------------------------------------------
-- Result table/view and Result related tables/views -- Result table/view and Result related tables/views
@ -7,16 +9,16 @@
-- Views on temporary tables that should be re-created in the end -- Views on temporary tables that should be re-created in the end
CREATE OR REPLACE VIEW ${stats_db_name}.result as CREATE OR REPLACE VIEW ${stats_db_name}.result as
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.publication_tmp FROM ${stats_db_name}.publication
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.software_tmp FROM ${stats_db_name}.software
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset_tmp FROM ${stats_db_name}.dataset
UNION ALL UNION ALL
SELECT *, bestlicence AS access_mode SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct_tmp; FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
-- Views on final tables -- Views on final tables
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
@ -30,7 +32,7 @@ SELECT *
FROM ${stats_db_name}.dataset_datasources FROM ${stats_db_name}.dataset_datasources
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_datasources; FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS
SELECT * SELECT *
@ -43,7 +45,7 @@ SELECT *
FROM ${stats_db_name}.dataset_citations FROM ${stats_db_name}.dataset_citations
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_citations; FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS
SELECT * SELECT *
@ -56,7 +58,7 @@ SELECT *
FROM ${stats_db_name}.dataset_classifications FROM ${stats_db_name}.dataset_classifications
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_classifications; FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS
SELECT * SELECT *
@ -69,7 +71,7 @@ SELECT *
FROM ${stats_db_name}.dataset_concepts FROM ${stats_db_name}.dataset_concepts
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_concepts; FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS
SELECT * SELECT *
@ -82,7 +84,7 @@ SELECT *
FROM ${stats_db_name}.dataset_languages FROM ${stats_db_name}.dataset_languages
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_languages; FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS
SELECT * SELECT *
@ -95,7 +97,7 @@ SELECT *
FROM ${stats_db_name}.dataset_oids FROM ${stats_db_name}.dataset_oids
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_oids; FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS
SELECT * SELECT *
@ -108,7 +110,7 @@ SELECT *
FROM ${stats_db_name}.dataset_pids FROM ${stats_db_name}.dataset_pids
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_pids; FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS
SELECT * SELECT *
@ -121,37 +123,44 @@ SELECT *
FROM ${stats_db_name}.dataset_topics FROM ${stats_db_name}.dataset_topics
UNION ALL UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics; FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge;
-- Staging table for result_fos: keeps only the id and topic of result_topics rows whose type is the
-- Fields of Science and Technology classification. The lvl1..lvl4 CTEs of result_fos read from this
-- table, and it is dropped again once result_fos has been created.
DROP TABLE IF EXISTS ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/
create table ${stats_db_name}.result_fos_base_tmp stored as parquet as
select /*+ COALESCE(100) */ id, topic from ${stats_db_name}.result_topics where type='Fields of Science and Technology classification'; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/
create table ${stats_db_name}.result_fos stored as parquet as create table ${stats_db_name}.result_fos stored as parquet as
with with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl1 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '__ %'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl2 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '____ %'),
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'), lvl3 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '______ %'),
lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') lvl4 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '________ %')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1 from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/
DROP TABLE ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization' WHERE r.reltype = 'resultOrganization'
and r.target like '50|%' and r.target like '50|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
-- noinspection SqlNoDataSourceInspectionForFile -- noinspection SqlNoDataSourceInspectionForFile
------------------------------------------------------------ ------------------------------------------------------------
@ -5,108 +7,65 @@
-- Datasource table/view and Datasource related tables/views -- Datasource table/view and Datasource related tables/views
------------------------------------------------------------ ------------------------------------------------------------
------------------------------------------------------------ ------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_tmp CREATE TABLE ${stats_db_name}.datasource stored as parquet as
( with piwik_datasource as (
`id` string, select id, split(originalidd, '\\:')[1] as piwik_id
`name` STRING, from ${openaire_db_name}.datasource
`type` STRING, lateral view explode(originalid) temp as originalidd
`dateofvalidation` STRING, where originalidd like "piwik:%"
`yearofvalidation` string, )
`harvested` BOOLEAN, select /*+ COALESCE(100) */
`piwik_id` INT, substr(dtrce.id, 4) as id,
`latitude` STRING, case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name,
`longitude` STRING, dtrce.datasourcetype.classname as type,
`websiteurl` STRING, dtrce.dateofvalidation.value as dateofvalidation,
`compatibility` STRING, case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end as yearofvalidation,
issn_printed STRING, case when res.d_id is null then false else true end as harvested,
issn_online STRING case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end as piwik_id,
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); dtrce.latitude.value as latitude,
dtrce.longitude.value as longitude,
dtrce.websiteurl.value as websiteurl,
dtrce.openairecompatibility.classid as compatibility,
dtrce.journal.issnprinted as issn_printed,
dtrce.journal.issnonline as issn_online
from ${openaire_db_name}.datasource dtrce
left outer join (select inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id
left outer join piwik_datasource piwik_d on piwik_d.id=dtrce.id
where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/
-- Insert statement that takes into account the piwik_id of the openAIRE graph
INSERT INTO ${stats_db_name}.datasource_tmp
SELECT substr(d1.id, 4) AS id,
officialname.value AS name,
datasourcetype.classname AS type,
dateofvalidation.value AS dateofvalidation,
date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation,
FALSE AS harvested,
CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id,
d1.latitude.value AS latitude,
d1.longitude.value AS longitude,
d1.websiteurl.value AS websiteurl,
d1.openairecompatibility.classid AS compatibility,
d1.journal.issnprinted AS issn_printed,
d1.journal.issnonline AS issn_online
FROM ${openaire_db_name}.datasource d1
LEFT OUTER JOIN
(SELECT id, split(originalidd, '\\:')[1] as piwik_id
FROM ${openaire_db_name}.datasource
LATERAL VIEW EXPLODE(originalid) temp AS originalidd
WHERE originalidd like "piwik:%") AS d2
ON d1.id = d2.id
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
-- Creating a temporary dual table that will be removed after the following insert
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
INSERT INTO ${stats_db_name}.dual VALUES ('X');
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
SELECT 'other',
'Other',
'Repository',
NULL,
NULL,
false,
0,
NULL,
NULL,
NULL,
'unknown',
null,
null
FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
DROP TABLE ${stats_db_name}.dual;
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge;
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
-- datasource sources: -- datasource sources:
-- where the datasource info have been collected from. -- where the datasource info have been collected from.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource select /*+ COALESCE(100) */ substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; FROM ${stats_db_name}.result_datasources; /*EOS*/

View File

@ -1,22 +1,24 @@
set mapred.job.queue.name=analytics; /*EOS*/
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
-- Organization table/view and Organization related tables/views -- Organization table/view and Organization related tables/views
---------------------------------------------------------------- ----------------------------------------------------------------
---------------------------------------------------------------- ----------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.organization purge; DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
SELECT substr(o.id, 4) as id, SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id,
o.legalname.value as name, o.legalname.value as name,
o.legalshortname.value as legalshortname, o.legalshortname.value as legalshortname,
o.country.classid as country o.country.classid as country
FROM ${openaire_db_name}.organization o FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT organization AS id, id AS datasource SELECT organization AS id, id AS datasource
FROM ${stats_db_name}.datasource_organizations; FROM ${stats_db_name}.datasource_organizations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS
SELECT id AS project, organization as id SELECT id AS project, organization as id
FROM ${stats_db_name}.project_organizations; FROM ${stats_db_name}.project_organizations; /*EOS*/

View File

@ -150,190 +150,367 @@
</decision> </decision>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}
${wf:actionData(wf:lastErrorNode())['stackTrace']}]</message>
</kill> </kill>
<action name="Step1"> <action name="Step1">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step1.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step1</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step2"/> <ok to="Step2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step2"> <action name="Step2">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step2.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step2</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step3"/> <ok to="Step3"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step3"> <action name="Step3">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step3.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step3</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step4"/> <ok to="Step4"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step4"> <action name="Step4">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step4.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step4</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step5"/> <ok to="Step5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step5"> <action name="Step5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step6"/> <ok to="Step6"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step6"> <action name="Step6">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step6.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step6</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step7"/> <ok to="Step7"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step7"> <action name="Step7">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step7.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step7</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step8"/> <ok to="Step8"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step8"> <action name="Step8">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step8.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step8</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step9"/> <ok to="Step9"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step9"> <action name="Step9">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step9.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step9</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step10"/> <ok to="Step10"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step10"> <action name="Step10">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step10.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step10</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step11"/> <ok to="Step11"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step11"> <action name="Step11">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step11.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step11</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step12"/> <ok to="Step12"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step12"> <action name="Step12">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step12.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step12</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step13"/> <ok to="Step13"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step13"> <action name="Step13">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step13.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step13</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step14"/> <ok to="Step14"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step14"> <action name="Step14">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step14.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step14</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15"/> <ok to="Step15"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step15"> <action name="Step15">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step15.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step15</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15_5"/> <ok to="Step15_5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step15_5"> <action name="Step15_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step15_5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step15_5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<param>external_stats_db_name=${external_stats_db_name}</param> <jar>dhp-stats-update-${projectVersion}.jar</jar>
</hive2> <spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Contexts"/> <ok to="Contexts"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Contexts"> <action name="Contexts">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>contexts.sh</exec> <exec>contexts.sh</exec>
@ -380,29 +557,51 @@
</action> </action>
<action name="Step16_1-definitions"> <action name="Step16_1-definitions">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step16_1-definitions.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step16_1-definitions</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step16_5"/> <ok to="Step16_5"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step16_5"> <action name="Step16_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step16_5.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step16_5</name>
<param>openaire_db_name=${openaire_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step19-finalize"/> <ok to="Step19-finalize"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step19-finalize"> <action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>finalizedb.sh</exec> <exec>finalizedb.sh</exec>
@ -415,7 +614,7 @@
</action> </action>
<action name="step20-createMonitorDB"> <action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>monitor.sh</exec> <exec>monitor.sh</exec>
@ -448,7 +647,7 @@
<!-- </action>--> <!-- </action>-->
<action name="step21-createObservatoryDB-pre"> <action name="step21-createObservatoryDB-pre">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>observatory-pre.sh</exec> <exec>observatory-pre.sh</exec>
@ -462,18 +661,29 @@
</action> </action>
<action name="step21-createObservatoryDB"> <action name="step21-createObservatoryDB">
<hive2 xmlns="uri:oozie:hive2-action:0.1"> <spark xmlns="uri:oozie:spark-action:0.2">
<jdbc-url>${hive_jdbc_url}</jdbc-url> <master>yarn</master>
<script>scripts/step21-createObservatoryDB.sql</script> <mode>cluster</mode>
<param>stats_db_name=${stats_db_name}</param> <name>Step21-createObservatoryDB</name>
<param>observatory_db_name=${observatory_db_name}</param> <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
</hive2> <jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--observatory_db_name</arg><arg>${observatory_db_name}</arg>
</spark>
<ok to="step21-createObservatoryDB-post"/> <ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="step21-createObservatoryDB-post"> <action name="step21-createObservatoryDB-post">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec> <exec>observatory-post.sh</exec>
@ -486,7 +696,7 @@
</action> </action>
<action name="step22-copyDataToImpalaCluster"> <action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec> <exec>copyDataToImpalaCluster.sh</exec>
@ -505,7 +715,7 @@
</action> </action>
<action name="step22a-createPDFsAggregated"> <action name="step22a-createPDFsAggregated">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>createPDFsAggregated.sh</exec> <exec>createPDFsAggregated.sh</exec>
@ -521,7 +731,7 @@
</action> </action>
<action name="step23-finalizeImpalaCluster"> <action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec> <exec>finalizeImpalaCluster.sh</exec>
@ -540,7 +750,7 @@
</action> </action>
<action name="Step24-updateCache"> <action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>updateCache.sh</exec> <exec>updateCache.sh</exec>