dhp-stats-update:

- Set Steps 2-7 and 9 to limit the number of files generated by Spark from 8000 down to 100, to improve file-transfer and querying performance.
- Allow the workflow to run up to Step10. Step11 seems to have some issues, even when using hive-action.
This commit is contained in:
Lampros Smyrnaios 2024-04-18 15:40:27 +03:00
parent 6f2ebb2a52
commit d46b78b659
8 changed files with 46 additions and 47 deletions

View File

@ -44,7 +44,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, instancetype.classname as type SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -52,7 +52,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
@ -63,7 +63,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
@ -76,14 +76,14 @@ FROM (
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
select substr(p.id, 4) as id, p.language.classname as language select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -91,7 +91,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -99,7 +99,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -107,7 +107,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""

View File

@ -45,7 +45,7 @@ WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
@ -54,7 +54,7 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -62,7 +62,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
@ -73,7 +73,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM ( FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
@ -87,14 +87,14 @@ FROM (
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -102,7 +102,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -110,7 +110,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -44,7 +44,7 @@ where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
@ -53,7 +53,7 @@ where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -61,7 +61,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
@ -72,7 +72,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
FROM ( FROM (
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
@ -86,14 +86,14 @@ FROM (
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
select substr(p.id, 4) AS id, p.language.classname AS language select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -101,7 +101,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
@ -109,7 +109,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -45,7 +45,7 @@ WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/
@ -53,14 +53,14 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
SELECT substr(p.id, 4) as id, case SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
@ -70,7 +70,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
@ -81,27 +81,27 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/

View File

@ -6,14 +6,14 @@
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' and r.source like '40|%' WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
@ -21,7 +21,7 @@ WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' and r.target like '40|%' WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
@ -29,7 +29,7 @@ WHERE r.reltype = 'resultProject' and r.target like '40|%'
-- project_classification: rebuilt from scratch (drop, then create-table-as-select, stored as Parquet).
-- LATERAL VIEW explode(p.h2020classification) yields one row per classification entry of each project.
-- Entries whose h2020programme is null are dropped, as are deleted-by-inference or invisible projects.
-- Output columns: the project id without its first three characters, the programme code, and level1..level3.
-- The /*+ COALESCE(100) */ hint is the change made by this commit; per the commit message it limits the
-- number of files Spark generates for the table to 100.
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/
create table ${stats_db_name}.project_classification STORED AS PARQUET as create table ${stats_db_name}.project_classification STORED AS PARQUET as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
@ -93,7 +93,7 @@ WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
create table ${stats_db_name}.funder STORED AS PARQUET as create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id, select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fund, '//funder/shortname') as shortname,
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
@ -102,7 +102,7 @@ from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fun
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
properties[0].value contribution, properties[1].value currency properties[0].value contribution, properties[1].value currency
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
LATERAL VIEW explode (r.properties) properties LATERAL VIEW explode (r.properties) properties

View File

@ -130,7 +130,7 @@ with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
from lvl1 from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/ join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/
@ -138,7 +138,7 @@ from lvl1
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization' WHERE r.reltype = 'resultOrganization'
and r.target like '50|%' and r.target like '50|%'
@ -147,8 +147,7 @@ WHERE r.reltype = 'resultOrganization'
-- result_projects: rebuilt from scratch (drop, then create-table-as-select, stored as Parquet).
-- Built entirely from stats-db tables: project_results is inner-joined to result (on the result id) and to
-- project_tmp (on the project id), so links whose result or project is missing from those tables are dropped.
-- `r` is used only as a join filter; no column of it is selected.
-- NOTE(review): `daysfromend` is computed as datediff(p.enddate, p.startdate), i.e. the number of days from
-- the project's start date to its end date — confirm this matches what the column name is meant to convey.
-- The /*+ COALESCE(100) */ hint is the change made by this commit; per the commit message it limits the
-- number of files Spark generates for the table to 100.
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/

View File

@ -6,7 +6,7 @@
DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
SELECT substr(o.id, 4) as id, SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id,
o.legalname.value as name, o.legalname.value as name,
o.legalshortname.value as legalshortname, o.legalshortname.value as legalshortname,
o.country.classid as country o.country.classid as country

View File

@ -303,8 +303,7 @@
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg> <arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg> <arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark> </spark>
<!-- <ok to="Step8"/>--> <ok to="Step8"/>
<ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -382,7 +381,8 @@
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg> <arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg> <arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark> </spark>
<ok to="Step11"/> <!-- <ok to="Step11"/>-->
<ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>