From 2c4dcc90ba1f0ef2f0baeeaca5468bc88c25f86f Mon Sep 17 00:00:00 2001 From: antleb Date: Sun, 14 Feb 2021 02:54:55 +0200 Subject: [PATCH] analyzing tables to produce stats --- .../graph/stats/oozie_app/scripts/step10.sql | 5 +++- .../graph/stats/oozie_app/scripts/step11.sql | 3 +++ .../graph/stats/oozie_app/scripts/step12.sql | 13 +++++++++++ .../graph/stats/oozie_app/scripts/step13.sql | 9 ++++++++ .../graph/stats/oozie_app/scripts/step14.sql | 13 +++++++++++ .../graph/stats/oozie_app/scripts/step15.sql | 9 ++++++++ .../graph/stats/oozie_app/scripts/step16.sql | 13 ++++++++++- .../stats/oozie_app/scripts/step16_5.sql | 5 +++- .../stats/oozie_app/scripts/step16_6.sql | 9 +++++++- .../graph/stats/oozie_app/scripts/step2.sql | 23 ++++++++++++++++--- .../graph/stats/oozie_app/scripts/step3.sql | 18 ++++++++++++++- .../graph/stats/oozie_app/scripts/step4.sql | 18 ++++++++++++++- .../graph/stats/oozie_app/scripts/step5.sql | 22 ++++++++++++++---- .../graph/stats/oozie_app/scripts/step6.sql | 15 ++++++++---- .../graph/stats/oozie_app/scripts/step7.sql | 7 ++++-- .../graph/stats/oozie_app/scripts/step8.sql | 9 +++++++- .../graph/stats/oozie_app/scripts/step9.sql | 3 +++ 17 files changed, 174 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 46ff295f4..e2e37d3fc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -18,4 +18,7 @@ CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_sta -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date; \ No newline at end of file +create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date; + +ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 13e141459..25f9f09ae 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -42,3 +42,6 @@ UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id I UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project; + +ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 25439852e..bf3f1d615 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -36,3 +36,16 @@ CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS acces -- To see with Antonis if the following is needed and where it should be placed ------------------------------------------------------------------------------- CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country; + +ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 795770313..4c1ca0203 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -57,3 +57,12 @@ UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; + +ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datast_sources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 4a56b5d68..e05e648b4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -47,3 +47,16 @@ FROM ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; + +ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datast_licenses COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 60b37048b..4940a73da 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -34,3 +34,12 @@ union all select * from ${stats_db_name}.software_refereed union all select * from ${stats_db_name}.otherresearchproduct_refereed; + +ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datast_refereed COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql index 33849b960..a7e6c74e4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql @@ -77,4 +77,15 @@ join ${stats_db_name}.result_datasources rd on rd.id=r.id join ${stats_db_name}.datasource d on d.id=rd.datasource join ${stats_db_name}.datasource_organizations dor on dor.id=d.id join ${stats_db_name}.organization o on o.id=dor.organization -where o.country is not null and o.country!=''; \ No newline at end of file +where o.country is not null and o.country!=''; + +ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index f737c1ea6..52b7a58fa 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -52,4 +52,7 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; drop table if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; \ No newline at end of file +drop table ${stats_db_name}.result_tmp; + +ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql index ced7bbc11..273a12108 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql @@ -29,4 +29,11 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture \ No newline at end of file +create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture + +ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index ba0db25be..ccd3994b4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -5,8 +5,6 @@ -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp; - CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal , @@ -41,4 +39,23 @@ CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; -- Publication_citations -CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and p.datainfo.deletedbyinference=false; \ No newline at end of file +CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and p.datainfo.deletedbyinference=false; + +ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index f69715a31..0100efe18 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,7 +5,6 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp; CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true'); INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal, @@ -34,3 +33,20 @@ CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; + +ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 2c4a625e1..b44bc08c6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,7 +5,6 @@ -------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.software_tmp; CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal, @@ -34,3 +33,20 @@ CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oid CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; + +ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 1fa5df8cb..a8b099d8d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,7 +5,6 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp; CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal, @@ -23,9 +22,7 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT sub CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource -from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN -(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; +CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference=false; @@ -34,3 +31,20 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4 CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; + +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index b4745535d..0091caad1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -6,21 +6,17 @@ ------------------------------------------------------ ------------------------------------------------------ -- Project_oids Table -DROP TABLE IF EXISTS ${stats_db_name}.project_oids; CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; -- Project_organizations Table -DROP TABLE IF EXISTS ${stats_db_name}.project_organizations; CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization' and r.datainfo.deletedbyinference=false; -- Project_results Table -DROP TABLE IF EXISTS ${stats_db_name}.project_results; CREATE TABLE ${stats_db_name}.project_results AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result FROM ${openaire_db_name}.relation r WHERE r.reltype='resultProject' and r.datainfo.deletedbyinference=false; -- Project table ---------------- -- Creating and populating temporary Project table -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp; CREATE TABLE ${stats_db_name}.project_tmp (id STRING, acronym STRING, title STRING, funder STRING, funding_lvl0 STRING, funding_lvl1 STRING, funding_lvl2 STRING, ec39 STRING, type STRING, startdate STRING, enddate STRING, start_year INT, end_year INT, duration INT, haspubs STRING, numpubs INT, daysforlastpub INT, delayedpubs INT, callidentifier STRING, code STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, p.acronym.value AS acronym, p.title.value AS title, xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, p.ecsc39.value AS ec39, p.contracttype.classname AS type, p.startdate.value AS startdate, p.enddate.value AS enddate, year(p.startdate.value) AS start_year, year(p.enddate.value) AS end_year, CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, 'no' AS haspubs, 0 AS numpubs, 0 AS daysforlastpub, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference=false; @@ -28,3 +24,14 @@ INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, p.acronym create table ${stats_db_name}.funder as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund + +ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.project_results COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project_results COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 36a4a8a49..8506657a6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -24,8 +24,11 @@ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * FROM ${stats_db_ CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * FROM ${stats_db_name}.publication_topics UNION ALL SELECT * FROM ${stats_db_name}.software_topics UNION ALL SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -DROP TABLE IF EXISTS ${stats_db_name}.result_organization; CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization' and r.datainfo.deletedbyinference=false; -DROP TABLE IF EXISTS ${stats_db_name}.result_projects; CREATE TABLE ${stats_db_name}.result_projects AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id=pr.result JOIN ${stats_db_name}.project_tmp p ON p.id=pr.id; + +ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 197047c8b..e74ecc768 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -55,4 +55,11 @@ create table if not exists ${stats_db_name}.datasource_sources AS select substr( CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; - +ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS; +ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index a918e4de4..4fb9919c9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -10,3 +10,6 @@ FROM ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=FALS CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations; CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id FROM ${stats_db_name}.project_organizations; + +ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS; +ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file