From 76594ded23455ecf67addcbebd3b4ca45b73199e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 1 Dec 2023 13:38:19 +0200 Subject: [PATCH] Changes to indicators Fixes on open access colours indicators - indi_pub_green_oa - indi_pub_gold_oa - indi_pub_hybrid - indi_pub_bronze_oa - indi_pub_diamond --- .../scripts/step16-createIndicatorsTables.sql | 129 +++++++++++++----- .../scripts/step20-createMonitorDB.sql | 2 + .../scripts/step20-createMonitorDBAll.sql | 2 + .../graph/stats/oozie_app/scripts/step3.sql | 18 +++ .../graph/stats/oozie_app/scripts/step4.sql | 15 ++ .../graph/stats/oozie_app/scripts/step5.sql | 16 +++ .../graph/stats/oozie_app/scripts/step6.sql | 14 ++ 7 files changed, 162 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 8180e6527..fea449de6 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,6 +1,18 @@ -- Sprint 1 ---- drop table if exists ${stats_db_name}.indi_pub_green_oa purge; +--create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as +--select distinct p.id, coalesce(green_oa, 0) as green_oa +--from ${stats_db_name}.publication p +-- left outer join ( +-- select p.id, 1 as green_oa +-- from ${stats_db_name}.publication p +-- join ${stats_db_name}.result_instance ri on ri.id = p.id +-- join ${stats_db_name}.datasource on datasource.id = ri.hostedby +-- where datasource.type like '%Repository%' +-- and (ri.accessright = 'Open Access' +-- or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp +-- on p.id= tmp.id; create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from ${stats_db_name}.publication p @@ -11,7 +23,7 @@ from ${stats_db_name}.publication p join ${stats_db_name}.datasource on datasource.id = ri.hostedby where datasource.type like '%Repository%' and (ri.accessright = 'Open Access' - or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp + or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and datasource.name!='Other') tmp on p.id= tmp.id; drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; @@ -183,15 +195,24 @@ drop table if exists ${stats_db_name}.tmp purge; ---- Sprint 4 ---- drop table if exists ${stats_db_name}.indi_pub_diamond purge; +--create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as +--select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal +--from ${stats_db_name}.publication_datasources pd +-- left outer join ( +-- select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd +-- join ${stats_db_name}.datasource d on d.id=pd.datasource +-- join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) +-- and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp +-- on pd.id=tmp.id; + create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from ${stats_db_name}.publication_datasources pd - left outer join ( - select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd - join ${stats_db_name}.datasource d on d.id=pd.datasource - join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) - and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp - on pd.id=tmp.id; +left outer join (select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd +join ${stats_db_name}.datasource d on d.id=pd.datasource +join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) +and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp +on pd.id=tmp.id; drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; @@ -312,28 +333,55 @@ drop table if exists ${stats_db_name}.indi_pub_gold_oa purge; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp -- on pd.id=tmp.id; +--create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as +--with gold_oa as ( +--SELECT issn,issn_l from stats_ext.issn_gold_oa_dataset_v5), +--issn AS (SELECT * FROM +--(SELECT id,issn_printed as issn FROM ${stats_db_name}.datasource +--WHERE issn_printed IS NOT NULL +--UNION ALL +--SELECT id, issn_online as issn FROM ${stats_db_name}.datasource +--WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn +--WHERE LENGTH(issn) > 7), +--alljournals AS(select issn, issn_l from stats_ext.alljournals +--where journal_is_in_doaj=true or journal_is_oa=true) +--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +--FROM ${stats_db_name}.publication_datasources pd +--left outer join ( +--select pd.id, 1 as is_gold FROM ${stats_db_name}.publication_datasources pd +--JOIN issn on issn.id=pd.datasource +--JOIN gold_oa on issn.issn = gold_oa.issn +--join alljournals on issn.issn=alljournals.issn +--left outer join ${stats_db_name}.result_instance ri on ri.id=pd.id +--and ri.accessright!='Closed Access' and ri.accessright_uw='gold') tmp +--on pd.id=tmp.id; create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as with gold_oa as ( -SELECT issn,issn_l from stats_ext.issn_gold_oa_dataset_v5), -issn AS (SELECT * FROM -(SELECT id,issn_printed as issn FROM ${stats_db_name}.datasource -WHERE issn_printed IS NOT NULL -UNION ALL -SELECT id, issn_online as issn FROM ${stats_db_name}.datasource -WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn -WHERE LENGTH(issn) > 7), -alljournals AS(select issn, issn_l from stats_ext.alljournals -where journal_is_in_doaj=true or journal_is_oa=true) +select distinct issn from ( + SELECT issn_l as issn from stats_ext.issn_gold_oa_dataset_v5 + UNION ALL + SELECT issn as issn from stats_ext.issn_gold_oa_dataset_v5 + UNION ALL + select issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true + UNION ALL + select issn_l as issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true) foo), +dd as ( +select distinct * from ( + select id, issn_printed as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' + UNION ALL + select id, issn_online as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' + UNION ALL + select id, issn_printed as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_printed + UNION ALL + select id, issn_online as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_online) foo +) SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold FROM ${stats_db_name}.publication_datasources pd left outer join ( -select pd.id, 1 as is_gold FROM ${stats_db_name}.publication_datasources pd -JOIN issn on issn.id=pd.datasource -JOIN gold_oa on issn.issn = gold_oa.issn -join alljournals on issn.issn=alljournals.issn -left outer join ${stats_db_name}.result_instance ri on ri.id=pd.id -and ri.accessright!='Closed Access' and ri.accessright_uw='gold') tmp -on pd.id=tmp.id; + select pd.id, 1 as is_gold + FROM ${stats_db_name}.publication_datasources pd + join dd on dd.id=pd.datasource + left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; @@ -421,15 +469,26 @@ drop table if exists ${stats_db_name}.indi_pub_hybrid purge; -- where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp -- on pd.id=tmp.id; +--create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as +--select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd +--left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication_datasources pd +--join ${stats_db_name}.datasource d on pd.datasource=d.id +--join ${stats_db_name}.result_instance ri on ri.id=pd.id +--join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id +--join ${stats_db_name}.result_accessroute ra on ra.id=pd.id +--where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' +--or indi_gold.is_gold=0) and (ra.accessroute='hybrid' or ri.license is not null)) tmp +--on pd.id=tmp.id; + create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as -select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd -left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication_datasources pd -join ${stats_db_name}.datasource d on pd.datasource=d.id +select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication pd +left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication pd join ${stats_db_name}.result_instance ri on ri.id=pd.id join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id join ${stats_db_name}.result_accessroute ra on ra.id=pd.id -where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' -or indi_gold.is_gold=0) and (ra.accessroute='hybrid' or ri.license is not null)) tmp +join ${stats_db_name}.datasource d on d.id=ri.hostedby +where indi_gold.is_gold=0 and ((d.type like '%Journal%' and ri.accessright!='Closed Access' and ri.accessright!='Restricted' and ri.license is not null) or +ra.accessroute='hybrid'))tmp on pd.id=tmp.id; drop table if exists ${stats_db_name}.indi_org_fairness purge; @@ -814,14 +873,16 @@ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; --and ri.accessright='Open Access') tmp on tmp.id=p.id; create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as -select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication_datasources pd -left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication_datasources pd -join ${stats_db_name}.datasource d on pd.datasource=d.id +select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication pd +left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication pd join ${stats_db_name}.result_instance ri on ri.id=pd.id join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id +join ${stats_db_name}.indi_pub_hybrid indi_hybrid on indi_hybrid.id=pd.id join ${stats_db_name}.result_accessroute ra on ra.id=pd.id -where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' -or indi_gold.is_gold=0) and (ra.accessroute='bronze' or ri.license is null)) tmp +join ${stats_db_name}.datasource d on d.id=ri.hostedby +where indi_gold.is_gold=0 and indi_hybrid.is_hybrid=0 +and ((d.type like '%Journal%' and ri.accessright!='Closed Access' +and ri.accessright!='Restricted' and ri.license is null) or ra.accessroute='bronze')) tmp on pd.id=tmp.id; CREATE TEMPORARY TABLE ${stats_db_name}.project_year_result_year as diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index b52abd865..c61a19e5c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -64,6 +64,8 @@ create table TARGET.result_accessroute stored as parquet as select * from SOURCE create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); + create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index 2b6a68514..167aac726 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -248,6 +248,8 @@ create table TARGET.indi_impact_measures stored as parquet as select * from SOUR create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index eb97263a7..0384de4ec 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,6 +5,8 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; + CREATE TABLE ${stats_db_name}.dataset_tmp ( id STRING, @@ -40,6 +42,8 @@ SELECT substr(d.id, 4) AS id, FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; + CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d @@ -47,12 +51,16 @@ FROM ${openaire_db_name}.dataset d WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; + CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; + CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id @@ -62,6 +70,8 @@ from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; + CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( @@ -74,23 +84,31 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; + CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; + CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; + CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; + CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0d1f6323e..d8f4d65e4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,6 +5,7 @@ -------------------------------------------------------- -- Software temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; CREATE TABLE ${stats_db_name}.software_tmp ( id STRING, @@ -40,6 +41,8 @@ SELECT substr(s.id, 4) as id, from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; + CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s @@ -47,6 +50,8 @@ FROM ${openaire_db_name}.software s where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; + CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p @@ -62,6 +67,8 @@ FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; + CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( @@ -74,23 +81,31 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; + CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; + CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; + CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; + CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 06b616d6a..fae0fbb63 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,6 +5,8 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, @@ -40,6 +42,8 @@ FROM ${openaire_db_name}.otherresearchproduct o WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; -- Otherresearchproduct_citations +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation @@ -51,6 +55,8 @@ SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id @@ -59,6 +65,8 @@ SELECT substr(p.id, 4) as id, case FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -68,21 +76,29 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index e0522e149..e5b3f504e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,29 +3,39 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; + CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; + CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; + CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; + create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; +DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; + CREATE TABLE ${stats_db_name}.project_tmp ( id STRING, @@ -80,12 +90,16 @@ SELECT substr(p.id, 4) AS id, FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.funder purge; + create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; +DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; + CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, properties[0].value contribution, properties[1].value currency