From d46b78b65949a58447821e050ed0179a8173c404 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 18 Apr 2024 15:40:27 +0300 Subject: [PATCH] dhp-stats-update: - Set Steps 2-7 and 9 to limit the number of files generated by Spark from 8000 down to 100, to improve file-transfer and querying performance. - Allow the workflow to run up to Step10. Step11 seems to have some issues, even when using hive-action. --- .../oa/graph/stats/oozie_app/scripts/step2.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step3.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step4.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step5.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step6.sql | 12 ++++++------ .../oa/graph/stats/oozie_app/scripts/step7.sql | 9 ++++----- .../oa/graph/stats/oozie_app/scripts/step9.sql | 2 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 6 +++--- 8 files changed, 46 insertions(+), 47 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 4aa90b1a2..8ec663573 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -44,7 +44,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, instancetype.classname as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where 
p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -52,7 +52,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -63,7 +63,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance @@ -76,14 +76,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS -select substr(p.id, 4) as id, p.language.classname as language +select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ 
substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -91,7 +91,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -99,7 +99,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as -select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -107,7 +107,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view 
explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 1ff4beadb..ebedb5dc5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -45,7 +45,7 @@ WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -54,7 +54,7 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -62,7 +62,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS 
${stats_db_name}.dataset_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -73,7 +73,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS -SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.dataset p @@ -87,14 +87,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -102,7 +102,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; 
/*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -110,7 +110,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0cffff052..4957d8d2f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -44,7 +44,7 @@ where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS -SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites 
+SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -53,7 +53,7 @@ where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -61,7 +61,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -72,7 +72,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( SELECT substr(p.id, 4) AS id, 
substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.software p @@ -86,14 +86,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS -select substr(p.id, 4) AS id, p.language.classname AS language +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -101,7 +101,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -109,7 +109,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS 
topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index d742bcc2a..820ec4395 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -45,7 +45,7 @@ WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS -SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ @@ -53,14 +53,14 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) 
instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -70,7 +70,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p @@ -81,27 +81,27 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids 
purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 75ec7d69c..d2688ec07 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -6,14 +6,14 @@ DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS -SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ @@ -21,7 +21,7 @@ WHERE r.reltype = 'projectOrganization' and r.source like '40|%' DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ @@ -29,7 +29,7 @@ WHERE r.reltype = 'resultProject' and r.target like '40|%' DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ create table ${stats_db_name}.project_classification STORED AS PARQUET as -select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, 
class.level3 +select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ @@ -93,7 +93,7 @@ WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ create table ${stats_db_name}.funder STORED AS PARQUET as -select distinct xpath_string(fund, '//funder/id') as id, +select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country @@ -102,7 +102,7 @@ from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fun DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS -SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, properties[0].value contribution, properties[1].value currency from ${openaire_db_name}.relation r LATERAL VIEW explode (r.properties) properties diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 2cc7c13c4..f3ab52004 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -130,7 +130,7 @@ 
with lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') -select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 +select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/ @@ -138,7 +138,7 @@ from lvl1 DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.target like '50|%' @@ -147,8 +147,7 @@ WHERE r.reltype = 'resultOrganization' DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS -select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance +select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ - + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index afde8160e..1d76b89a6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -6,7 +6,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS -SELECT substr(o.id, 4) as id, +SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname.value as legalshortname, o.country.classid as country diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 5c255a488..37d837e76 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -303,8 +303,7 @@ --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} - - + @@ -382,7 +381,8 @@ --openaire_db_name${openaire_db_name} --external_stats_db_name${external_stats_db_name} - + +