Update "dhp-stats-update" workflow to use "spark"-actions, instead of "hive" ones.

Note: currently the code is set to test only "Step1".
This commit is contained in:
Lampros Smyrnaios 2024-04-15 16:22:40 +03:00
parent d7da4f814b
commit db33f7727c
22 changed files with 627 additions and 382 deletions
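
For orientation (not part of the commit itself): a "hive" action hands a whole HiveQL script to HiveServer2, while a "spark" action launches a driver class, so each step script has to be executed statement by statement through a Hive-enabled SparkSession. Below is a minimal sketch of such a driver; the class name, argument layout and error handling are assumptions for illustration, not the project's actual code. It relies on the "/*EOS*/" end-of-statement markers added to the SQL files in this commit.

import org.apache.spark.sql.SparkSession
import scala.io.Source

object RunSqlStep {
  def main(args: Array[String]): Unit = {
    val scriptPath = args(0) // e.g. a local copy of the step's SQL script (hypothetical argument layout)

    val spark = SparkSession.builder()
      .appName("dhp-stats-update step")
      .enableHiveSupport() // the statements target Hive databases and tables
      .getOrCreate()

    val script = Source.fromFile(scriptPath).mkString
    script
      .split("""/\*EOS\*/""")       // statement boundaries are the explicit markers
      .map(_.trim.stripSuffix(";")) // spark.sql() expects one bare statement at a time
      .filter(_.nonEmpty)
      // placeholders such as ${stats_db_name} are assumed to be substituted
      // beforehand; see the substitution sketch after the first SQL file below
      .foreach(stmt => spark.sql(stmt))

    spark.stop()
  }
}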

View File

@ -0,0 +1,18 @@
# Install the whole "dnet-hadoop" project.
# Delete this module's previous build-files in order to avoid any conflicts.
rm -rf target/ || true
# Go to the root directory of this project.
cd ../../
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Install the project.
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
# We skip tests for all modules, since they take a long time and some of them fail.
# Any test added to this module will be executed in the "runOozieWorkflow.sh" script.

View File

@ -0,0 +1,20 @@
# This script deploys and runs the Oozie workflow on the cluster defined in the "~/.dhp/application.properties" file.
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Build and deploy this module.
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
# Show the Oozie-job-ID.
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
cat ./target/extract-and-run-on-remote-host.log
# Check oozie workflow status
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
# Get the <job-ID> from the previous output and check the logs:
# yarn logs -applicationId <job-ID>

View File

@ -4,5 +4,5 @@
--------------------------------------------------------------
--------------------------------------------------------------
DROP database IF EXISTS ${stats_db_name} CASCADE;
CREATE database ${stats_db_name};
DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
CREATE database ${stats_db_name}; /*EOS*/
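
The "${...}" parameters above were previously resolved through the Hive action's script-parameter substitution; a Spark-based runner presumably has to resolve them itself before handing each statement to spark.sql. A tiny sketch of such a substitution helper follows; the helper name and the example value are purely illustrative, not the project's actual substitution logic.

// Hypothetical helper, not the project's actual code.
def substitute(stmt: String, params: Map[String, String]): String =
  params.foldLeft(stmt) { case (s, (name, value)) => s.replace("${" + name + "}", value) }

substitute("CREATE database ${stats_db_name}", Map("stats_db_name" -> "some_stats_db"))
// => "CREATE database some_stats_db"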

View File

@ -5,27 +5,27 @@
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
SELECT *
FROM ${external_stats_db_name}.fundref;
FROM ${external_stats_db_name}.fundref; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.country AS
SELECT *
FROM ${external_stats_db_name}.country;
FROM ${external_stats_db_name}.country; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
SELECT *
FROM ${external_stats_db_name}.countrygdp;
FROM ${external_stats_db_name}.countrygdp; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
SELECT *
FROM ${external_stats_db_name}.roarmap;
FROM ${external_stats_db_name}.roarmap; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT *
FROM ${external_stats_db_name}.rndexpediture;
FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
SELECT *
FROM ${external_stats_db_name}.licenses_normalized;
FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
@ -33,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
create or replace view ${stats_db_name}.usage_stats as
select * from openaire_prod_usage_stats.usage_stats;
select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
create or replace view ${stats_db_name}.downloads_stats as
select * from openaire_prod_usage_stats.downloads_stats;
select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
create or replace view ${stats_db_name}.pageviews_stats as
select * from openaire_prod_usage_stats.pageviews_stats;
select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
create or replace view ${stats_db_name}.views_stats as
select * from openaire_prod_usage_stats.views_stats;
select * from openaire_prod_usage_stats.views_stats; /*EOS*/
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
create table ${stats_db_name}.creation_date STORED AS PARQUET as
select date_format(current_date(), 'dd-MM-yyyy') as date;
select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/

View File

@ -10,7 +10,7 @@ SET harvested='true'
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
FROM ${stats_db_name}.datasource_tmp d,
${stats_db_name}.result_datasources rd
WHERE d.id = rd.datasource);
WHERE d.id = rd.datasource); /*EOS*/
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp
@ -19,8 +19,8 @@ WHERE project_tmp.id IN (SELECT pr.id
FROM ${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pr.result = r.id
AND r.type = 'publication');
DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
AND r.type = 'publication'); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.stored purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id,
@ -63,7 +63,7 @@ FROM ${stats_db_name}.project_tmp p
AND r.type = 'publication'
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
ON prr2.id = p.id;
ON prr2.id = p.id; /*EOS*/
UPDATE ${stats_db_name}.publication_tmp
SET delayed = 'yes'
@ -73,7 +73,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.dataset_tmp
SET delayed = 'yes'
@ -83,7 +83,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.software_tmp
SET delayed = 'yes'
@ -93,7 +93,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
UPDATE ${stats_db_name}.otherresearchproduct_tmp
SET delayed = 'yes'
@ -103,7 +103,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
SELECT result_projects.id AS result,
@ -116,4 +116,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project
WHERE result_projects.id = result.id
AND result.type = 'publication'
AND project.id = result_projects.project;
AND project.id = result_projects.project; /*EOS*/

View File

@ -1,42 +1,42 @@
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
SELECT *
FROM ${stats_db_name}.datasource_tmp;
FROM ${stats_db_name}.datasource_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
SELECT *
FROM ${stats_db_name}.publication_tmp;
FROM ${stats_db_name}.publication_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
SELECT *
FROM ${stats_db_name}.dataset_tmp;
FROM ${stats_db_name}.dataset_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software purge;
DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
SELECT *
FROM ${stats_db_name}.software_tmp;
FROM ${stats_db_name}.software_tmp; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
SELECT *
FROM ${stats_db_name}.otherresearchproduct_tmp;
FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
----------------------------------------------
-- Re-creating views from final parquet tables
@ -54,4 +54,4 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct;
FROM ${stats_db_name}.otherresearchproduct; /*EOS*/

View File

@ -5,7 +5,7 @@
-- Sources related tables/views
------------------------------------------------------
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -16,9 +16,9 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -29,9 +29,9 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -42,9 +42,9 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -55,7 +55,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources
@ -64,9 +64,9 @@ SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL
SELECT * FROM ${stats_db_name}.software_sources
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
@ -76,9 +76,9 @@ from (
LATERAL VIEW explode(author) a as auth
LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
@ -91,9 +91,9 @@ where reltype='resultResult'
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
@ -108,9 +108,9 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(target, 4);
group by substr(target, 4); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
@ -125,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(source, 4);
group by substr(source, 4); /*EOS*/

View File

@ -5,33 +5,33 @@
-- Licences related tables/views
------------------------------------------------------
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
@ -40,15 +40,15 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge;
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
@ -58,10 +58,10 @@ FROM (
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; /*EOS*/

View File

@ -6,7 +6,7 @@
------------------------------------------------------
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -22,9 +22,9 @@ from (
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -40,9 +40,9 @@ from (
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -58,9 +58,9 @@ from (
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -76,7 +76,7 @@ from (
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
where peer_reviewed.id is null) pr; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
@ -85,17 +85,17 @@ select * from ${stats_db_name}.dataset_refereed
union all
select * from ${stats_db_name}.software_refereed
union all
select * from ${stats_db_name}.otherresearchproduct_refereed;
select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge;
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads';
where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
@ -104,4 +104,4 @@ rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/

View File

@ -1,25 +1,25 @@
-------------------------------------------
--- Extra tables, mostly used by indicators
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
select r.id, count(distinct p.id) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id;
group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
select r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id;
group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as (
@ -33,17 +33,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
sum(case when rcount.type='software' then rcount.count else 0 end) as software,
sum(case when rcount.type='other' then rcount.count else 0 end) as other
from rcount
group by rcount.pid;
group by rcount.pid; /*EOS*/
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates;
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
create table if not exists ${stats_db_name}.result_instance stored as parquet as
select distinct r.*
@ -51,9 +51,9 @@ from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id;
join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select distinct r.id, r.amount, r.currency
@ -61,6 +61,6 @@ from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null;
where r.amount is not null; /*EOS*/
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/

View File

@ -3,26 +3,26 @@
----------------------------------------------------
-- Peer reviewed:
drop table if exists ${stats_db_name}.result_peerreviewed purge;
drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
-- Green OA:
drop table if exists ${stats_db_name}.result_greenoa purge;
drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
select r.id, case when green.green_oa=1 then true else false end as green
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
-- GOLD OA:
drop table if exists ${stats_db_name}.result_gold purge;
drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
select r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/

View File

@ -1,6 +1,6 @@
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp;
drop table if exists ${stats_db_name}.result_tmp; /*EOS*/
CREATE TABLE ${stats_db_name}.result_tmp (
id STRING,
@ -20,37 +20,37 @@ CREATE TABLE ${stats_db_name}.result_tmp (
peer_reviewed BOOLEAN,
green BOOLEAN,
gold BOOLEAN)
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.publication r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.dataset r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.software r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.otherresearchproduct r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
drop table ${stats_db_name}.result_tmp;
drop table if exists ${stats_db_name}.result; /*EOS*/
drop view if exists ${stats_db_name}.result; /*EOS*/
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/
drop table ${stats_db_name}.result_tmp; /*EOS*/

View File

@ -5,7 +5,7 @@
--------------------------------------------------------------
-- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_tmp
(
id STRING,
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.publication_tmp
abstract BOOLEAN,
type STRING
)
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.publication_tmp
SELECT substr(p.id, 4) as id,
@ -39,17 +39,17 @@ SELECT substr(p.id, 4) as id,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case
@ -58,9 +58,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@ -71,44 +71,44 @@ FROM (
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
select substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge;
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -5,7 +5,7 @@ from ${stats_db_name}.result r
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
from ${stats_db_name}.result_licenses rl
group by rl.id
) rln on rln.id=r.id;
) rln on rln.id=r.id; /*EOS*/
create table ${observatory_db_name}.result_affiliated_country stored as parquet as
@ -35,7 +35,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year stored as parquet as
@ -65,7 +65,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
@ -95,7 +95,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
@ -127,7 +127,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select
@ -158,7 +158,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select
@ -187,7 +187,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select
@ -216,7 +216,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select
@ -247,7 +247,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select
@ -278,7 +278,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_country stored as parquet as
select
@ -309,7 +309,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_year stored as parquet as
select
@ -340,7 +340,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
@ -372,7 +372,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select
@ -403,7 +403,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select
@ -434,7 +434,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select
@ -465,7 +465,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select
@ -496,7 +496,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select
@ -529,7 +529,7 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select
@ -562,4 +562,4 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/

View File

@ -5,7 +5,7 @@
------------------------------------------------------
-- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_tmp
(
@ -23,7 +23,7 @@ CREATE TABLE ${stats_db_name}.dataset_tmp
abstract BOOLEAN,
type STRING
)
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true');
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.dataset_tmp
SELECT substr(d.id, 4) AS id,
@ -40,26 +40,26 @@ SELECT substr(d.id, 4) AS id,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case
@ -68,9 +68,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@ -82,35 +82,35 @@ FROM (
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
View File
@ -5,7 +5,7 @@
--------------------------------------------------------
-- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_tmp
(
id STRING,
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.software_tmp
abstract BOOLEAN,
type STRING
)
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.software_tmp
SELECT substr(s.id, 4) as id,
@ -39,24 +39,24 @@ SELECT substr(s.id, 4) as id,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case
@ -65,9 +65,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
@ -79,35 +79,35 @@ FROM (
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
select substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge;
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
View File
@ -5,7 +5,7 @@
--------------------------------------------------------------------------------
-- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
(
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
source STRING,
abstract BOOLEAN,
type STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
SELECT substr(o.id, 4) AS id,
@ -39,23 +39,23 @@ SELECT substr(o.id, 4) AS id,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/
-- Otherresearchproduct_citations
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false;
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case
@ -63,9 +63,9 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@ -74,32 +74,32 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN(SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
View File
@ -3,38 +3,38 @@
-- Project table/view and Project related tables/views
------------------------------------------------------
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/
create table ${stats_db_name}.project_classification STORED AS PARQUET as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null;
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_tmp
(
@ -61,7 +61,7 @@ CREATE TABLE ${stats_db_name}.project_tmp
totalcost FLOAT,
fundedamount FLOAT,
currency STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
INSERT INTO ${stats_db_name}.project_tmp
SELECT substr(p.id, 4) AS id,
@ -88,18 +88,18 @@ SELECT substr(p.id, 4) AS id,
p.fundedamount AS fundedamount,
p.currency.value AS currency
FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.funder purge;
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname,
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge;
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
@ -107,4 +107,4 @@ properties[0].value contribution, properties[1].value currency
from ${openaire_db_name}.relation r
LATERAL VIEW explode (r.properties) properties
where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%'
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
View File
@ -16,7 +16,7 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset_tmp
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct_tmp;
FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
-- Views on final tables
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
@ -30,7 +30,7 @@ SELECT *
FROM ${stats_db_name}.dataset_datasources
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_datasources;
FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS
SELECT *
@ -43,7 +43,7 @@ SELECT *
FROM ${stats_db_name}.dataset_citations
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_citations;
FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS
SELECT *
@ -56,7 +56,7 @@ SELECT *
FROM ${stats_db_name}.dataset_classifications
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_classifications;
FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS
SELECT *
@ -69,7 +69,7 @@ SELECT *
FROM ${stats_db_name}.dataset_concepts
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_concepts;
FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS
SELECT *
@ -82,7 +82,7 @@ SELECT *
FROM ${stats_db_name}.dataset_languages
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_languages;
FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS
SELECT *
@ -95,7 +95,7 @@ SELECT *
FROM ${stats_db_name}.dataset_oids
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_oids;
FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS
SELECT *
@ -108,7 +108,7 @@ SELECT *
FROM ${stats_db_name}.dataset_pids
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_pids;
FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS
SELECT *
@ -121,9 +121,9 @@ SELECT *
FROM ${stats_db_name}.dataset_topics
UNION ALL
SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics;
FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/
create table ${stats_db_name}.result_fos stored as parquet as
with
@ -133,22 +133,22 @@ with
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization'
and r.target like '50|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge;
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/
View File
@ -5,7 +5,7 @@
-- Datasource table/view and Datasource related tables/views
------------------------------------------------------------
------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_tmp
(
@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp
`compatibility` STRING,
issn_printed STRING,
issn_online STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
-- Insert statement that takes into account the piwik_id of the openAIRE graph
INSERT INTO ${stats_db_name}.datasource_tmp
@ -46,14 +46,14 @@ FROM ${openaire_db_name}.datasource d1
LATERAL VIEW EXPLODE(originalid) temp AS originalidd
WHERE originalidd like "piwik:%") AS d2
ON d1.id = d2.id
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary dual table that will be removed after the following insert
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/
INSERT INTO ${stats_db_name}.dual VALUES ('X');
INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
@ -71,42 +71,42 @@ SELECT 'other',
null,
null
FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
DROP TABLE ${stats_db_name}.dual;
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/
DROP TABLE ${stats_db_name}.dual; /*EOS*/
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false;
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
-- datasource sources:
-- where the datasource info has been collected from.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge;
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources;
FROM ${stats_db_name}.result_datasources; /*EOS*/
View File
@ -3,7 +3,7 @@
-- Organization table/view and Organization related tables/views
----------------------------------------------------------------
----------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.organization purge;
DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
SELECT substr(o.id, 4) as id,
@ -11,12 +11,12 @@ SELECT substr(o.id, 4) as id,
o.legalshortname.value as legalshortname,
o.country.classid as country
FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE;
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT organization AS id, id AS datasource
FROM ${stats_db_name}.datasource_organizations;
FROM ${stats_db_name}.datasource_organizations; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS
SELECT id AS project, organization as id
FROM ${stats_db_name}.project_organizations;
FROM ${stats_db_name}.project_organizations; /*EOS*/
View File
@ -154,180 +154,354 @@
</kill>
<action name="Step1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step1.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step2"/>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step1</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<!-- <ok to="Step2"/>-->
<ok to="End"/>
<error to="Kill"/>
</action>
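Note: the spark actions in this workflow rely on eu.dnetlib.dhp.oozie.RunSQLSparkJob executing a plain SQL script through Spark SQL. As a rough orientation only, here is a minimal Java sketch of what such a runner presumably does with these arguments: read the script given by --sql from the classpath, substitute the remaining named arguments into their ${...} placeholders, split the file on the /*EOS*/ markers introduced in the scripts above, and run each statement with Hive support enabled. The class name, argument parsing and every call below are illustrative assumptions, not the project's actual implementation.

// Hypothetical sketch only; RunSQLSparkJob's real argument handling may differ.
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.SparkSession;

public class RunSqlSketch {
    public static void main(String[] args) throws Exception {
        // naive "--key value" parsing; the real job presumably uses a parameter spec
        Map<String, String> params = new HashMap<>();
        for (int i = 0; i + 1 < args.length; i += 2) {
            params.put(args[i].replaceFirst("^--", ""), args[i + 1]);
        }

        SparkSession spark = SparkSession.builder()
            .appName("RunSqlSketch")
            .config("hive.metastore.uris", params.get("hiveMetastoreUris"))
            .enableHiveSupport()
            .getOrCreate();

        // load the SQL script shipped inside the job jar
        String sql = IOUtils.toString(
            RunSqlSketch.class.getClassLoader().getResourceAsStream(params.get("sql")),
            StandardCharsets.UTF_8);

        // substitute ${stats_db_name}, ${openaire_db_name}, ... from the remaining args
        for (Map.Entry<String, String> e : params.entrySet()) {
            sql = sql.replace("${" + e.getKey() + "}", e.getValue());
        }

        // the /*EOS*/ markers delimit individual statements within one script
        for (String stmt : sql.split("/\\*EOS\\*/")) {
            if (!stmt.trim().isEmpty()) {
                spark.sql(stmt);
            }
        }
        spark.stop();
    }
}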
<action name="Step2">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step2.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step2</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step3"/>
<error to="Kill"/>
</action>
<action name="Step3">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step3.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step3</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step4"/>
<error to="Kill"/>
</action>
<action name="Step4">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step4.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step4</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step5"/>
<error to="Kill"/>
</action>
<action name="Step5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step5</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step6"/>
<error to="Kill"/>
</action>
<action name="Step6">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step6.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step6</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step7"/>
<error to="Kill"/>
</action>
<action name="Step7">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step7.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step7</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step8"/>
<error to="Kill"/>
</action>
<action name="Step8">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step8.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step8</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step9"/>
<error to="Kill"/>
</action>
<action name="Step9">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step9.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step9</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step10"/>
<error to="Kill"/>
</action>
<action name="Step10">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step10.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step10</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step11"/>
<error to="Kill"/>
</action>
<action name="Step11">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step11.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step11</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step12"/>
<error to="Kill"/>
</action>
<action name="Step12">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step12.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step12</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step13"/>
<error to="Kill"/>
</action>
<action name="Step13">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step13.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step13</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step14"/>
<error to="Kill"/>
</action>
<action name="Step14">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step14.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step14</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15"/>
<error to="Kill"/>
</action>
<action name="Step15">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step15.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step15</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step15_5"/>
<error to="Kill"/>
</action>
<action name="Step15_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step15_5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step15_5</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Contexts"/>
<error to="Kill"/>
</action>
@ -379,23 +553,45 @@
</action>
<action name="Step16_1-definitions">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step16_1-definitions.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step16_1-definitions</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step16_5"/>
<error to="Kill"/>
</action>
<action name="Step16_5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step16_5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step16_5</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step19-finalize"/>
<error to="Kill"/>
</action>
@ -461,12 +657,23 @@
</action>
<action name="step21-createObservatoryDB">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step21-createObservatoryDB.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>observatory_db_name=${observatory_db_name}</param>
</hive2>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step21-createObservatoryDB</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--observatory_db_name</arg><arg>${observatory_db_name}</arg>
</spark>
<ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/>
</action>
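The spark actions above also assume that ${sparkSqlWarehouseDir}, ${sparkClusterOpts}, ${sparkResourceOpts} and ${sparkApplicationOpts} are supplied by the job configuration. Purely as an illustration (the property names come from the workflow itself, but every value below is an assumption to be tuned per cluster, not the project's actual settings), such a configuration fragment could look like:

# Hypothetical job configuration fragment -- values are placeholders.
sparkSqlWarehouseDir=/user/hive/warehouse
sparkClusterOpts=--conf spark.yarn.queue=default --conf spark.network.timeout=600s
sparkResourceOpts=--executor-memory 4G --executor-cores 4 --driver-memory 4G --num-executors 20
sparkApplicationOpts=--conf spark.sql.shuffle.partitions=1000
hive_metastore_uris=thrift://<metastore-host>:9083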