Improve performance and efficiency by rewriting the creation of the "publication", "project", "dataset", "datasource", "software", "otherresearchproduct" and "result" tables, so that each table is built by a single query.

2024-07-03 13:03:15 +03:00 · 2024-07-03 13:03:15 +03:00 · 54e11b6a43
parent fe2275a9b0
commit 54e11b6a43
10 changed files with 248 additions and 458 deletions
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
@ -4,108 +4,6 @@
 ----------------------------------------------------------------
 ----------------------------------------------------------------
 --Datasource temporary table updates
 -- Flag as harvested every datasource that is referenced by at least one row of result_datasources.
 UPDATE ${stats_db_name}.datasource_tmp
 SET harvested = 'true'
 WHERE datasource_tmp.id IN (
     SELECT DISTINCT ds.id
     FROM ${stats_db_name}.datasource_tmp ds
              INNER JOIN ${stats_db_name}.result_datasources res_ds ON ds.id = res_ds.datasource
 ); -- /*EOS*/
 -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
 -- Mark every project that is linked, through project_results, to at least one result of type 'publication'.
 UPDATE ${stats_db_name}.project_tmp
 SET haspubs = 'yes'
 WHERE project_tmp.id IN (
     SELECT proj_res.id
     FROM ${stats_db_name}.project_results proj_res
              INNER JOIN ${stats_db_name}.result res ON proj_res.result = res.id
     WHERE res.type = 'publication'
 ); -- /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.project purge; -- /*EOS*/
 -- Final project table: every project_tmp row, enriched with publication statistics.
 --   haspubs / numpubs            come from pub_counts (distinct publications per project);
 --   daysforlastpub / delayedpubs come from late_pubs (publications dated after the project's end date).
 -- Projects without a match in a sub-query get 'no' / 0 for the corresponding columns.
 CREATE TABLE ${stats_db_name}.project stored as parquet as
 SELECT proj.id,
        proj.acronym,
        proj.title,
        proj.funder,
        proj.funding_lvl0,
        proj.funding_lvl1,
        proj.funding_lvl2,
        proj.ec39,
        proj.type,
        proj.startdate,
        proj.enddate,
        proj.start_year,
        proj.end_year,
        proj.duration,
        CASE WHEN pub_counts.id IS NOT NULL THEN 'yes' ELSE 'no' END                AS haspubs,
        CASE WHEN pub_counts.id IS NOT NULL THEN pub_counts.np ELSE 0 END           AS numpubs,
        CASE WHEN late_pubs.id IS NOT NULL THEN late_pubs.daysForlastPub ELSE 0 END AS daysforlastpub,
        CASE WHEN late_pubs.id IS NOT NULL THEN late_pubs.dp ELSE 0 END             AS delayedpubs,
        proj.callidentifier,
        proj.code,
        proj.totalcost,
        proj.fundedamount,
        proj.currency
 FROM ${stats_db_name}.project_tmp proj
          LEFT JOIN (
              -- number of distinct publications attached to each project
              SELECT proj_res.id, count(distinct proj_res.result) AS np
              FROM ${stats_db_name}.project_results proj_res
                       INNER JOIN ${stats_db_name}.result res ON proj_res.result = res.id
              WHERE res.type = 'publication'
              GROUP BY proj_res.id
          ) AS pub_counts ON pub_counts.id = proj.id
          LEFT JOIN (
              -- publications dated after the project's end date: largest gap in days and distinct count
              SELECT inner_proj.id,
                     max(datediff(to_date(res.date), to_date(inner_proj.enddate))) AS daysForlastPub,
                     count(distinct res.id)                                        AS dp
              FROM ${stats_db_name}.project_tmp inner_proj
                       INNER JOIN ${stats_db_name}.project_results proj_res ON inner_proj.id = proj_res.id
                       INNER JOIN ${stats_db_name}.result res ON proj_res.result = res.id
              WHERE res.type = 'publication'
                AND datediff(to_date(res.date), to_date(inner_proj.enddate)) > 0
              GROUP BY inner_proj.id
          ) AS late_pubs ON late_pubs.id = proj.id; -- /*EOS*/
 -- Flag as delayed every publication whose date falls after the end date of at least one linked project.
 -- publication_tmp.delayed is declared BOOLEAN, so a boolean literal is assigned rather than the string 'yes'.
 -- The gap is measured with datediff() (integer number of days), the same test the project table query uses,
 -- instead of subtracting two DATE values and comparing the result with 0.
 UPDATE ${stats_db_name}.publication_tmp
 SET delayed = true
 WHERE publication_tmp.id IN (SELECT DISTINCT r.id
                             FROM ${stats_db_name}.result r
                                      INNER JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
                                      INNER JOIN ${stats_db_name}.project_tmp p ON pr.id = p.id
                             WHERE datediff(to_date(r.date), to_date(p.enddate)) > 0); -- /*EOS*/
 -- Flag as delayed every dataset whose date falls after the end date of at least one linked project.
 -- dataset_tmp.delayed is declared BOOLEAN, so a boolean literal is assigned rather than the string 'yes'.
 -- The gap is measured with datediff() (integer number of days), the same test the project table query uses,
 -- instead of subtracting two DATE values and comparing the result with 0.
 UPDATE ${stats_db_name}.dataset_tmp
 SET delayed = true
 WHERE dataset_tmp.id IN (SELECT DISTINCT r.id
                         FROM ${stats_db_name}.result r
                                  INNER JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
                                  INNER JOIN ${stats_db_name}.project_tmp p ON pr.id = p.id
                         WHERE datediff(to_date(r.date), to_date(p.enddate)) > 0); -- /*EOS*/
 -- Flag as delayed every software record whose date falls after the end date of at least one linked project.
 -- software_tmp.delayed is declared BOOLEAN, so a boolean literal is assigned rather than the string 'yes'.
 -- The gap is measured with datediff() (integer number of days), the same test the project table query uses,
 -- instead of subtracting two DATE values and comparing the result with 0.
 UPDATE ${stats_db_name}.software_tmp
 SET delayed = true
 WHERE software_tmp.id IN (SELECT DISTINCT r.id
                          FROM ${stats_db_name}.result r
                                   INNER JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
                                   INNER JOIN ${stats_db_name}.project_tmp p ON pr.id = p.id
                          WHERE datediff(to_date(r.date), to_date(p.enddate)) > 0); -- /*EOS*/
 -- Flag as delayed every other research product whose date falls after the end date of at least one linked project.
 -- otherresearchproduct_tmp.delayed is declared BOOLEAN, so a boolean literal is assigned rather than the string 'yes'.
 -- The gap is measured with datediff() (integer number of days), the same test the project table query uses,
 -- instead of subtracting two DATE values and comparing the result with 0.
 UPDATE ${stats_db_name}.otherresearchproduct_tmp
 SET delayed = true
 WHERE otherresearchproduct_tmp.id IN (SELECT DISTINCT r.id
                                      FROM ${stats_db_name}.result r
                                               INNER JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
                                               INNER JOIN ${stats_db_name}.project_tmp p ON pr.id = p.id
                                      WHERE datediff(to_date(r.date), to_date(p.enddate)) > 0); -- /*EOS*/
 CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
 SELECT result_projects.id          AS result,
       result_projects.project     AS project_results,
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql
@ -1,42 +1,4 @@
------------------------------------------------------------------------------------------------------
+set mapred.job.queue.name=analytics; /*EOS*/
 -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
 ------------------------------------------------------------------------------------------------------
 -- Each final table is dropped and then rebuilt as a parquet copy of its *_tmp counterpart.
 -- datasource
 DROP TABLE IF EXISTS ${stats_db_name}.datasource PURGE; /*EOS*/
 CREATE TABLE ${stats_db_name}.datasource STORED AS PARQUET AS
     SELECT * FROM ${stats_db_name}.datasource_tmp; /*EOS*/
 -- publication
 DROP TABLE IF EXISTS ${stats_db_name}.publication PURGE; /*EOS*/
 CREATE TABLE ${stats_db_name}.publication STORED AS PARQUET AS
     SELECT * FROM ${stats_db_name}.publication_tmp; /*EOS*/
 -- dataset
 DROP TABLE IF EXISTS ${stats_db_name}.dataset PURGE; /*EOS*/
 CREATE TABLE ${stats_db_name}.dataset STORED AS PARQUET AS
     SELECT * FROM ${stats_db_name}.dataset_tmp; /*EOS*/
 -- software
 DROP TABLE IF EXISTS ${stats_db_name}.software PURGE; /*EOS*/
 CREATE TABLE ${stats_db_name}.software STORED AS PARQUET AS
     SELECT * FROM ${stats_db_name}.software_tmp; /*EOS*/
 -- otherresearchproduct
 DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct PURGE; /*EOS*/
 CREATE TABLE ${stats_db_name}.otherresearchproduct STORED AS PARQUET AS
     SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
 -- Clean-up of the temporary tables.
 -- IF EXISTS keeps the clean-up idempotent: a re-run, or a run in which one of the temporary tables
 -- was never created, no longer aborts on a missing table.
 DROP TABLE IF EXISTS ${stats_db_name}.project_tmp; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.software_tmp; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
 ----------------------------------------------
 -- Re-creating views from final parquet tables
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql
@ -1,58 +1,26 @@
 set mapred.job.queue.name=analytics; /*EOS*/
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
+-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
 -- peer reviewed)
 -- Transactional ORC staging table for the unified result rows, bucketed on id.
 DROP TABLE IF EXISTS ${stats_db_name}.result_tmp; /*EOS*/
 CREATE TABLE ${stats_db_name}.result_tmp
 (
     id               STRING,
     title            STRING,
     publisher        STRING,
     journal          STRING,
     `date`           STRING,
     `year`           INT,
     bestlicence      STRING,
     access_mode      STRING,
     embargo_end_date STRING,
     delayed          BOOLEAN,
     authors          INT,
     source           STRING,
     abstract         BOOLEAN,
     type             STRING,
     peer_reviewed    BOOLEAN,
     green            BOOLEAN,
     gold             BOOLEAN
 )
     CLUSTERED BY (id) INTO 100 BUCKETS
     STORED AS ORC
     TBLPROPERTIES ('transactional' = 'true'); /*EOS*/
 -- The four per-type tables are appended to result_tmp one after the other. Values are matched to the
 -- result_tmp columns by position; bestlicence is selected twice because it fills both bestlicence and access_mode.
 -- Each row is enriched with the peer_reviewed / green / gold flags (NULL when there is no matching row).
 -- 1) publications
 INSERT INTO ${stats_db_name}.result_tmp
 SELECT res.id, res.title, res.publisher, res.journal,
        res.`date`, date_format(res.`date`, 'yyyy'),
        res.bestlicence, res.bestlicence,
        res.embargo_end_date, res.delayed, res.authors, res.source, res.abstract, res.type,
        peer.peer_reviewed, goa.green, gld.gold
 FROM ${stats_db_name}.publication res
          LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
          LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
          LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id; /*EOS*/
 -- 2) datasets
 INSERT INTO ${stats_db_name}.result_tmp
 SELECT res.id, res.title, res.publisher, res.journal,
        res.`date`, date_format(res.`date`, 'yyyy'),
        res.bestlicence, res.bestlicence,
        res.embargo_end_date, res.delayed, res.authors, res.source, res.abstract, res.type,
        peer.peer_reviewed, goa.green, gld.gold
 FROM ${stats_db_name}.dataset res
          LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
          LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
          LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id; /*EOS*/
 -- 3) software
 INSERT INTO ${stats_db_name}.result_tmp
 SELECT res.id, res.title, res.publisher, res.journal,
        res.`date`, date_format(res.`date`, 'yyyy'),
        res.bestlicence, res.bestlicence,
        res.embargo_end_date, res.delayed, res.authors, res.source, res.abstract, res.type,
        peer.peer_reviewed, goa.green, gld.gold
 FROM ${stats_db_name}.software res
          LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
          LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
          LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id; /*EOS*/
 -- 4) other research products
 INSERT INTO ${stats_db_name}.result_tmp
 SELECT res.id, res.title, res.publisher, res.journal,
        res.`date`, date_format(res.`date`, 'yyyy'),
        res.bestlicence, res.bestlicence,
        res.embargo_end_date, res.delayed, res.authors, res.source, res.abstract, res.type,
        peer.peer_reviewed, goa.green, gld.gold
 FROM ${stats_db_name}.otherresearchproduct res
          LEFT JOIN ${stats_db_name}.result_peerreviewed peer ON peer.id = res.id
          LEFT JOIN ${stats_db_name}.result_greenoa goa ON goa.id = res.id
          LEFT JOIN ${stats_db_name}.result_gold gld ON gld.id = res.id; /*EOS*/
 -- Remove any existing object named "result", whether it is a table or a view.
 DROP TABLE IF EXISTS ${stats_db_name}.result; /*EOS*/
 DROP VIEW IF EXISTS ${stats_db_name}.result; /*EOS*/
-create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/
+drop table if exists ${stats_db_name}.result; /*EOS*/
-drop table ${stats_db_name}.result_tmp; /*EOS*/
+
 -- Build the final "result" table in a single pass: the four per-type tables are stacked with UNION ALL
 -- and each row is enriched with the peer_reviewed / green / gold flags (NULL when there is no matching row).
 -- A CTAS takes its schema from the SELECT list, so every output column needs a unique, explicit name:
 --   * `year` is derived once, in the outer query, and cast to INT to match the `year` column of result_tmp;
 --   * access_mode carries the same value as bestlicence, as it does in result_tmp.
 -- The inner queries select plain column names only (no table alias is defined inside them).
 CREATE TABLE ${stats_db_name}.result stored as parquet as
 SELECT /*+ COALESCE(100) */
        r.id,
        r.title,
        r.publisher,
        r.journal,
        r.`date`,
        CAST(DATE_FORMAT(r.`date`, 'yyyy') AS INT) AS `year`,
        r.bestlicence,
        r.bestlicence                              AS access_mode,
        r.embargo_end_date,
        r.delayed,
        r.authors,
        r.source,
        r.abstract,
        r.type,
        pr.peer_reviewed,
        green.green,
        gold.gold
 FROM (
    (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
        FROM ${stats_db_name}.publication)
    UNION ALL
    (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
        FROM ${stats_db_name}.dataset)
    UNION ALL
    (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
        FROM ${stats_db_name}.software)
    UNION ALL
    (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
        FROM ${stats_db_name}.otherresearchproduct)
    ) r
 LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
 LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
 LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
@ -7,41 +7,41 @@ set mapred.job.queue.name=analytics; /*EOS*/
 --------------------------------------------------------------
 -- Publication temporary table
-DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
 -- Transactional ORC table holding one row per publication, bucketed on id.
 CREATE TABLE ${stats_db_name}.publication_tmp (
     id STRING,
     title STRING,
     publisher STRING,
     journal STRING,
     `date` STRING,
     `year` STRING,
     bestlicence STRING,
     embargo_end_date STRING,
     delayed BOOLEAN,
     authors INT,
     source STRING,
     abstract BOOLEAN,
     type STRING
 )
 CLUSTERED BY (id) INTO 100 BUCKETS
 STORED AS ORC
 TBLPROPERTIES ('transactional' = 'true'); /*EOS*/
-INSERT INTO ${stats_db_name}.publication_tmp
+CREATE TABLE ${stats_db_name}.publication stored as parquet as
 -- Builds the publication table in one query.
 --   pub_pr      : one row per (publication, project) pair linked by a 'resultProject' / 'isProducedBy' relation,
 --                 with delayed = true when the publication's acceptance date is later than the project's end date.
 --                 Rows flagged deletedbyinference or invisible are excluded on all three inputs.
 --   pub_delayed : collapses pub_pr to one flag per publication with max(delayed).
 -- The final select left-joins pub_delayed, so publications without any project get delayed = false via coalesce.
-SELECT substr(p.id, 4)                                            as id,
+with pub_pr as (
-       p.title[0].value                                           as title,
+    select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-       p.publisher.value                                          as publisher,
+    from ${openaire_db_name}.publication pub
-       p.journal.name                                             as journal,
+             join ${openaire_db_name}.relation rel
-       p.dateofacceptance.value                                   as date,
+                  on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
-       date_format(p.dateofacceptance.value, 'yyyy')              as year,
+                      and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-       p.bestaccessright.classname                                as bestlicence,
+             join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-       p.embargoenddate.value                                     as embargo_end_date,
+    where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
-       false                                                      as delayed,
+),
-       size(p.author)                                             as authors,
+ pub_delayed as (
-       concat_ws('\u003B', p.source.value)                        as source,
+     select pub_id, max(delayed) as delayed
-       case when size(p.description) > 0 then true else false end as abstract,
+     from pub_pr
     group by pub_id
 )
 -- The stored id is the source id with its first three characters removed (substr(..., 4)).
 select /*+ COALESCE(100) */
    substr(pub.id, 4)                                                     as id,
    pub.title[0].value                                                    as title,
    pub.publisher.value                                                   as publisher,
    pub.journal.name                                                      as journal,
    pub.dateofacceptance.value                                            as date,
    date_format(pub.dateofacceptance.value, 'yyyy')                       as year,
    pub.bestaccessright.classname                                         as bestlicence,
    pub.embargoenddate.value                                              as embargo_end_date,
    coalesce(pub_delayed.delayed, false)                                  as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
    size(pub.author)                                                      as authors,
    concat_ws('\u003B', pub.source.value)                                 as source,
    case when size(pub.description) > 0 then true else false end          as abstract,
    'publication'                                                         as type
-from ${openaire_db_name}.publication p
+from ${openaire_db_name}.publication pub
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+    left outer join pub_delayed on pub.id=pub_delayed.pub_id
 where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
@ -5,42 +5,41 @@
 ------------------------------------------------------
 -- Dataset temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
-CREATE TABLE ${stats_db_name}.dataset_tmp
+CREATE TABLE ${stats_db_name}.dataset stored as parquet as
 -- Builds the dataset table in one query.
 --   datast_pr      : one row per (dataset, project) pair linked by a 'resultProject' / 'isProducedBy' relation,
 --                    with delayed = true when the dataset's acceptance date is later than the project's end date.
 --                    Rows flagged deletedbyinference or invisible are excluded on all three inputs.
 --   datast_delayed : collapses datast_pr to one flag per dataset with max(delayed).
 -- The final select left-joins datast_delayed, so datasets without any project get delayed = false via coalesce.
 -- NOTE(review): the unmarked "type STRING" line just before the closing parenthesis of datast_delayed looks like
 -- residue of the removed dataset_tmp column list rather than part of the new query — confirm against the real file.
-(
+with datast_pr as (
-    id               STRING,
+    select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-    title            STRING,
+    from ${openaire_db_name}.dataset datast
-    publisher        STRING,
+        join ${openaire_db_name}.relation rel
-    journal          STRING,
+            on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id
-    date             STRING,
+                and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-    year             STRING,
+    join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-    bestlicence      STRING,
+    where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false
-    embargo_end_date STRING,
+),
-    delayed          BOOLEAN,
+datast_delayed as (
-    authors          INT,
+    select datast_id, max(delayed) as delayed
-    source           STRING,
+    from datast_pr
-    abstract         BOOLEAN,
+    group by datast_id
    type             STRING
 )
-    clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
+select /*+ COALESCE(100) */
    substr(datast.id, 4)                                                  as id,
    datast.title[0].value                                                 as title,
    datast.publisher.value                                                as publisher,
    cast(null as string)                                                  as journal,
    datast.dateofacceptance.value                                         as date,
    date_format(datast.dateofacceptance.value, 'yyyy')                    as year,
    datast.bestaccessright.classname                                      as bestlicence,
    datast.embargoenddate.value                                           as embargo_end_date,
    coalesce(datast_delayed.delayed, false)                               as delayed, -- It's delayed, when the dataset was published after the end of the project.
    size(datast.author)                                                   as authors,
    concat_ws('\u003B', datast.source.value)                              as source,
    case when size(datast.description) > 0 then true else false end       as abstract,
    'dataset'                                                             as type
 from ${openaire_db_name}.dataset datast
    left outer join datast_delayed on datast.id=datast_delayed.datast_id
 where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/
 -- Load dataset_tmp from the source dataset table, skipping rows flagged deletedbyinference or invisible.
 -- delayed starts as false and journal is always NULL for datasets.
 insert into ${stats_db_name}.dataset_tmp
 select
     substr(src.id, 4)                                            as id,
     src.title[0].value                                           as title,
     src.publisher.value                                          as publisher,
     cast(null as string)                                         as journal,
     src.dateofacceptance.value                                   as date,
     date_format(src.dateofacceptance.value, 'yyyy')              as year,
     src.bestaccessright.classname                                as bestlicence,
     src.embargoenddate.value                                     as embargo_end_date,
     false                                                        as delayed,
     size(src.author)                                             as authors,
     concat_ws('\u003B', src.source.value)                        as source,
     case when size(src.description) > 0 then true else false end as abstract,
     'dataset'                                                    as type
 from ${openaire_db_name}.dataset src
 where src.datainfo.deletedbyinference = false
   and src.datainfo.invisible = false; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@ -5,41 +5,41 @@
 --------------------------------------------------------
 -- Software temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
 -- Transactional ORC table holding one row per software record, bucketed on id.
 CREATE TABLE ${stats_db_name}.software_tmp (
     id STRING,
     title STRING,
     publisher STRING,
     journal STRING,
     `date` STRING,
     `year` STRING,
     bestlicence STRING,
     embargo_end_date STRING,
     delayed BOOLEAN,
     authors INT,
     source STRING,
     abstract BOOLEAN,
     type STRING
 )
 CLUSTERED BY (id) INTO 100 BUCKETS
 STORED AS ORC
 TBLPROPERTIES ('transactional' = 'true'); /*EOS*/
-INSERT INTO ${stats_db_name}.software_tmp
+CREATE TABLE ${stats_db_name}.software stored as parquet as
 -- Builds the software table in one query.
 --   soft_pr      : one row per (software, project) pair linked by a 'resultProject' / 'isProducedBy' relation,
 --                  with delayed = true when the software's acceptance date is later than the project's end date.
 --                  Rows flagged deletedbyinference or invisible are excluded on all three inputs.
 --   soft_delayed : collapses soft_pr to one flag per software record with max(delayed).
 -- The final select left-joins soft_delayed, so software without any project gets delayed = false via coalesce.
-SELECT substr(s.id, 4)                                            as id,
+with soft_pr as (
-       s.title[0].value                                           AS title,
+    select soft.id as soft_id, case when (to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-       s.publisher.value                                          AS publisher,
+    from ${openaire_db_name}.software soft
-       CAST(NULL AS string)                                       AS journal,
+        join ${openaire_db_name}.relation rel
-       s.dateofacceptance.value                                   AS DATE,
+            on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id
-       date_format(s.dateofacceptance.value, 'yyyy')              AS YEAR,
+                and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-       s.bestaccessright.classname                                AS bestlicence,
+        join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-       s.embargoenddate.value                                     AS embargo_end_date,
+    where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false
-       FALSE                                                      AS delayed,
+),
-       SIZE(s.author)                                             AS authors,
+soft_delayed as (
-       concat_ws('\u003B', s.source.value)                        AS source,
+    select soft_id, max(delayed) as delayed
-       CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
+    from soft_pr
    group by soft_id
 )
 -- The stored id is the source id with its first three characters removed (substr(..., 4)); journal is always NULL.
 select /*+ COALESCE(100) */
    substr(soft.id, 4)                                                       as id,
    soft.title[0].value                                                      as title,
    soft.publisher.value                                                     as publisher,
    cast(null as string)                                                     as journal,
    soft.dateofacceptance.value                                              as date,
    date_format(soft.dateofacceptance.value, 'yyyy')                         as year,
    soft.bestaccessright.classname                                           as bestlicence,
    soft.embargoenddate.value                                                as embargo_end_date,
    coalesce(soft_delayed.delayed, false)                                    as delayed, -- It's delayed, when the software was published after the end of the project.
    size(soft.author)                                                        as authors,
    concat_ws('\u003B', soft.source.value)                                   as source,
    case when size(soft.description) > 0 then true else false end            as abstract,
    'software'                                                               as type
-from ${openaire_db_name}.software s
+from ${openaire_db_name}.software soft
-where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
+         left outer join soft_delayed on soft.id=soft_delayed.soft_id
 where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@ -5,41 +5,41 @@
 --------------------------------------------------------------------------------
 -- Otherresearchproduct temporary table supporting updates
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
-CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
+CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as
 -- Builds the otherresearchproduct table in one query.
 --   other_pr      : one row per (other research product, project) pair linked by a 'resultProject' / 'isProducedBy'
 --                   relation, with delayed = true when the acceptance date is later than the project's end date.
 --                   Rows flagged deletedbyinference or invisible are excluded on all three inputs.
 --   other_delayed : collapses other_pr to one flag per product with max(delayed).
 -- The final select left-joins other_delayed and takes the delayed flag from it, so products without any
 -- project get delayed = false via coalesce.
-(
+with other_pr as (
-    id               STRING,
+    select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-    title            STRING,
+    from ${openaire_db_name}.otherresearchproduct other
-    publisher        STRING,
+    join ${openaire_db_name}.relation rel
-    journal          STRING,
+        on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id
-    date             STRING,
+            and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-    year             STRING,
+    join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-    bestlicence      STRING,
+    where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false
-    embargo_end_date STRING,
+),
-    delayed          BOOLEAN,
+other_delayed as (
-    authors          INT,
+    select other_id, max(delayed) as delayed
-    source           STRING,
+    from other_pr
-    abstract         BOOLEAN,
+    group by other_id
-    type             STRING
+)
-) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
+select /*+ COALESCE(100) */
    substr(other.id, 4)                                            as id,
    other.title[0].value                                           as title,
    other.publisher.value                                          as publisher,
    cast(null as string)                                           as journal,
    other.dateofacceptance.value                                   as date,
    date_format(other.dateofacceptance.value, 'yyyy')              as year,
    other.bestaccessright.classname                                as bestlicence,
    other.embargoenddate.value                                     as embargo_end_date,
    coalesce(other_delayed.delayed, false)                         as delayed, -- It's delayed, when the product was published after the end of at least one of its projects.
    size(other.author)                                             as authors,
    concat_ws('\u003B', other.source.value)                        as source,
    case when size(other.description) > 0 then true else false end as abstract,
    'other'                                                        as type
 from ${openaire_db_name}.otherresearchproduct other
    left outer join other_delayed on other.id=other_delayed.other_id
 where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/
 -- Load otherresearchproduct_tmp from the source table, skipping rows flagged deletedbyinference or invisible.
 -- delayed starts as false and journal is always NULL for other research products.
 insert into ${stats_db_name}.otherresearchproduct_tmp
 select
     substr(src.id, 4)                                            as id,
     src.title[0].value                                           as title,
     src.publisher.value                                          as publisher,
     cast(null as string)                                         as journal,
     src.dateofacceptance.value                                   as date,
     date_format(src.dateofacceptance.value, 'yyyy')              as year,
     src.bestaccessright.classname                                as bestlicence,
     src.embargoenddate.value                                     as embargo_end_date,
     false                                                        as delayed,
     size(src.author)                                             as authors,
     concat_ws('\u003B', src.source.value)                        as source,
     case when size(src.description) > 0 then true else false end as abstract,
     'other'                                                      as type
 from ${openaire_db_name}.otherresearchproduct src
 where src.datainfo.deletedbyinference = false
   and src.datainfo.invisible = false; /*EOS*/
 -- Otherresearchproduct_citations
 DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
@ -34,61 +34,69 @@ from ${openaire_db_name}.project p
    lateral view explode(p.h2020classification) classifs as class
 where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/
 -- Builds the final project table in a single CTAS (added lines), replacing the
 -- project_tmp DDL shown on the removed lines.
 --   pr_pub          : one row per (project, publication) pair linked through an
 --                     isProducedBy relation, with the day difference between the
 --                     publication acceptance date and the project end date
 --   num_pubs_pr     : number of distinct publications per project
 --   pub_delayed     : per (project, publication) flag, true when the publication was
 --                     accepted after the project end date
 --   num_pub_delayed : number of distinct delayed publications per project
-CREATE TABLE ${stats_db_name}.project_tmp
+CREATE TABLE ${stats_db_name}.project stored as parquet as
-(
+with pr_pub as (
-    id             STRING,
+    select pr.id as pr_id, pub.id as pub_id,
-    acronym        STRING,
+        (case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed,
-    title          STRING,
+        max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub
-    funder         STRING,
+    from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication
-    funding_lvl0   STRING,
+        where datainfo.deletedbyinference = false and datainfo.invisible = false) pub
-    funding_lvl1   STRING,
+    join ${openaire_db_name}.relation rel
-    funding_lvl2   STRING,
+        on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id
-    ec39           STRING,
+            and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-    type           STRING,
+    join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project
-    startdate      STRING,
+            where datainfo.deletedbyinference = false and datainfo.invisible = false) pr
-    enddate        STRING,
+        on pr.id=rel.target
-    start_year     INT,
+    group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate
-    end_year       INT,
+),
-    duration       INT,
+num_pubs_pr as (
-    haspubs        STRING,
+    select pr_id, count( distinct pub_id) as num_pubs
-    numpubs        INT,
+    from pr_pub
-    daysforlastpub INT,
+    group by pr_id
-    delayedpubs    INT,
+),
-    callidentifier STRING,
+pub_delayed as (
-    code           STRING,
+    select pr_id, pub_id, max(delayed) as delayed
-    totalcost       FLOAT,
+    from pr_pub
-    fundedamount    FLOAT,
+    group by pr_id, pub_id
-    currency        STRING
+),
-) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/
+num_pub_delayed as (
    select pr_id, count(distinct pub_id) as num_delayed
    from pub_delayed
    where delayed
    group by pr_id
 )
 -- Final projection: exactly one output row per visible, non-deleted project.
 -- pr_pub holds one row per (project, publication) pair, so it is never joined
 -- directly. It is first collapsed to one row per project (last_pub), which also turns
 -- the per-pair day difference into the real per-project maximum.
 -- Projects without publications get haspubs = no and 0 in the three numeric
 -- publication columns, the same defaults the former project_tmp insert used.
 select /*+ COALESCE(100) */
    substr(p.id, 4)                                                             as id,
    p.acronym.value                                                             as acronym,
    p.title.value                                                               as title,
    xpath_string(p.fundingtree[0].value, '//funder/name')                       as funder,
    xpath_string(p.fundingtree[0].value, '//funding_level_0/name')              as funding_lvl0,
    xpath_string(p.fundingtree[0].value, '//funding_level_1/name')              as funding_lvl1,
    xpath_string(p.fundingtree[0].value, '//funding_level_2/name')              as funding_lvl2,
    p.ecsc39.value                                                              as ec39,
    p.contracttype.classname                                                    as type,
    p.startdate.value                                                           as startdate,
    p.enddate.value                                                             as enddate,
    year(p.startdate.value)                                                     as start_year,
    year(p.enddate.value)                                                       as end_year,
    cast(months_between(p.enddate.value, p.startdate.value) as int)             as duration,
    case when num_pubs_pr.pr_id is null then 'no' else 'yes' end                as haspubs,
    coalesce(num_pubs_pr.num_pubs, 0)                                           as numpubs,
    coalesce(last_pub.daysForlastPub, 0)                                        as daysForlastPub,
    coalesce(npd.num_delayed, 0)                                                as delayedpubs,
    p.callidentifier.value                                                      as callidentifier,
    p.code.value                                                                as code,
    p.totalcost                                                                 as totalcost,
    p.fundedamount                                                              as fundedamount,
    p.currency.value                                                            as currency
 from ${openaire_db_name}.project p
 left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id
 left outer join (
        select pr_id, max(daysForlastPub) as daysForlastPub
        from pr_pub
        group by pr_id
    ) last_pub on last_pub.pr_id = p.id
 left outer join num_pub_delayed npd on npd.pr_id=p.id
 where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/
 -- Fills project_tmp with one row per visible, non-deleted graph project.
 -- The publication-derived columns are written as constants here:
 -- haspubs = no, numpubs = 0, daysforlastpub = 0, delayedpubs = 0.
 -- NOTE(review): the DDL of project_tmp appears only on the removed (-) lines of this
 -- diff - confirm this insert is still meant to run.
 insert into ${stats_db_name}.project_tmp
 select
     substr(prj.id, 4)                                                   as id,        -- id without its first three characters
     prj.acronym.value                                                   as acronym,
     prj.title.value                                                     as title,
     -- funder and funding levels are read from the first fundingtree entry only
     xpath_string(prj.fundingtree[0].value, '//funder/name')             as funder,
     xpath_string(prj.fundingtree[0].value, '//funding_level_0/name')    as funding_lvl0,
     xpath_string(prj.fundingtree[0].value, '//funding_level_1/name')    as funding_lvl1,
     xpath_string(prj.fundingtree[0].value, '//funding_level_2/name')    as funding_lvl2,
     prj.ecsc39.value                                                    as ec39,
     prj.contracttype.classname                                          as type,
     prj.startdate.value                                                 as startdate,
     prj.enddate.value                                                   as enddate,
     year(prj.startdate.value)                                           as start_year,
     year(prj.enddate.value)                                             as end_year,
     cast(months_between(prj.enddate.value, prj.startdate.value) as int) as duration,  -- whole months
     'no'                                                                as haspubs,
     0                                                                   as numpubs,
     0                                                                   as daysforlastpub,
     0                                                                   as delayedpubs,
     prj.callidentifier.value                                            as callidentifier,
     prj.code.value                                                      as code,
     prj.totalcost                                                       as totalcost,
     prj.fundedamount                                                    as fundedamount,
     prj.currency.value                                                  as currency
 from ${openaire_db_name}.project prj
 where prj.datainfo.invisible = false
   and prj.datainfo.deletedbyinference = false; /*EOS*/
 -- Remove any previous copy of the funder table
 drop table if exists ${stats_db_name}.funder purge; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
@ -7,16 +7,16 @@
 -- View unifying the four per-type result tables (publication, software, dataset,
 -- otherresearchproduct) with UNION ALL, so rows are concatenated without deduplication.
 -- Every branch also exposes bestlicence a second time under the name access_mode.
 -- NOTE(review): this view used to be described as sitting on temporary tables that must
 -- be re-created at the end. The added (+) lines read from the final tables instead of
 -- the *_tmp ones.
 CREATE OR REPLACE VIEW ${stats_db_name}.result as
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.publication_tmp
+FROM ${stats_db_name}.publication
 UNION ALL
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.software_tmp
+FROM ${stats_db_name}.software
 UNION ALL
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.dataset_tmp
+FROM ${stats_db_name}.dataset
 UNION ALL
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/
+FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
 -- Views on final tables
 CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
@ -153,4 +153,4 @@ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
 select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
 FROM ${stats_db_name}.result r
         JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
-         JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/
+         JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
@ -5,81 +5,36 @@
 -- Datasource table/view and Datasource related tables/views
 ------------------------------------------------------------
 ------------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; -- /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
 -- Builds the final datasource table in a single CTAS (added lines), replacing the
 -- datasource_tmp DDL shown on the removed lines.
 --   piwik_datasource : for every original id that starts with piwik: , the datasource id
 --                      and the text after the colon
 --   harvested        : true when at least one result instance is hosted by the datasource
 --   name             : Unknown Repository is rewritten to Other
 --   yearofvalidation : NULL when dateofvalidation is the literal -1
 -- NOTE(review): piwik_id mixes the integer 0 with the string produced by split, while the
 -- removed DDL declared piwik_id as INT - confirm the resulting column type is acceptable.
 -- NOTE(review): a datasource whose originalid lists several piwik: entries would still
 -- appear once per entry - confirm that cannot happen in the graph.
-CREATE TABLE ${stats_db_name}.datasource_tmp
+CREATE TABLE ${stats_db_name}.datasource stored as parquet as
-(
+with piwik_datasource as (
-    `id`               string,
+    select id, split(originalidd, '\\:')[1] as piwik_id
-    `name`             STRING,
+    from ${openaire_db_name}.datasource
-    `type`             STRING,
+             lateral view explode(originalid) temp as originalidd
-    `dateofvalidation` STRING,
+    where originalidd like "piwik:%"
-    `yearofvalidation` string,
+)
-    `harvested`        BOOLEAN,
+select /*+ COALESCE(100) */
-    `piwik_id`         INT,
+       substr(dtrce.id, 4)                                                                                                 as id,
-    `latitude`         STRING,
+       case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end              as name,
-    `longitude`        STRING,
+       dtrce.datasourcetype.classname                                                                                      as type,
-    `websiteurl`       STRING,
+       dtrce.dateofvalidation.value                                                                                        as dateofvalidation,
-    `compatibility`    STRING,
+       case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end    as yearofvalidation,
-    issn_printed       STRING,
+       case when res.d_id is null then false else true end                                                                 as harvested,
-    issn_online        STRING
+       case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end                                                 as piwik_id,
-) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/
+       dtrce.latitude.value                                                                                                as latitude,
       dtrce.longitude.value                                                                                               as longitude,
       dtrce.websiteurl.value                                                                                              as websiteurl,
       dtrce.openairecompatibility.classid                                                                                 as compatibility,
       dtrce.journal.issnprinted                                                                                           as issn_printed,
       dtrce.journal.issnonline                                                                                            as issn_online
 from ${openaire_db_name}.datasource dtrce
         -- distinct keeps one row per hosting datasource. Without it the join would repeat
         -- every datasource once per result instance it hosts.
         left outer join (select distinct inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id
         left outer join piwik_datasource piwik_d on piwik_d.id=dtrce.id
 where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/
 -- Loads datasource_tmp from the graph datasource table, taking the piwik_id carried in
 -- the openAIRE graph into account. harvested is always written as FALSE here, and
 -- piwik_id falls back to 0 when the datasource has no piwik: original id.
 insert into ${stats_db_name}.datasource_tmp
 select
     substr(ds.id, 4)                                          as id,               -- id without its first three characters
     ds.officialname.value                                     as name,
     ds.datasourcetype.classname                               as type,
     ds.dateofvalidation.value                                 as dateofvalidation,
     date_format(ds.dateofvalidation.value, 'yyyy')            as yearofvalidation,
     false                                                     as harvested,
     case when pw.piwik_id is null then 0 else pw.piwik_id end as piwik_id,
     ds.latitude.value                                         as latitude,
     ds.longitude.value                                        as longitude,
     ds.websiteurl.value                                       as websiteurl,
     ds.openairecompatibility.classid                          as compatibility,
     ds.journal.issnprinted                                    as issn_printed,
     ds.journal.issnonline                                     as issn_online
 from ${openaire_db_name}.datasource ds
 left outer join (
         -- one row per original id that starts with piwik: , keeping the text after the colon
         select id, split(originalidd, '\\:')[1] as piwik_id
         from ${openaire_db_name}.datasource
             lateral view explode(originalid) temp as originalidd
         where originalidd like "piwik:%"
     ) as pw
     on ds.id = pw.id
 where ds.datainfo.invisible = false
   and ds.datainfo.deletedbyinference = false; -- /*EOS*/
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
+DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
 -- Creating a temporary dual table that will be removed after the following insert
 -- The table holds a single row (dummy = X) so that the SELECT below yields at most one row.
 DROP TABLE IF EXISTS ${stats_db_name}.dual purge; -- /*EOS*/
 CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); -- /*EOS*/
 INSERT INTO ${stats_db_name}.dual VALUES ('X'); -- /*EOS*/
 -- Adds a synthetic datasource row with id other, name Other, type Repository and
 -- compatibility unknown. All remaining columns are NULL, false or 0.
 -- The WHERE guard does not depend on the dual row: the insert happens only when the
 -- literal other is not among the ids of datasource_tmp rows named Unknown Repository.
 -- NOTE(review): if that subquery returns a NULL id, the NOT IN predicate is not true and
 -- nothing is inserted - confirm ids can never be NULL here.
 INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
                                             `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
 SELECT 'other',
       'Other',
       'Repository',
       NULL,
       NULL,
       false,
       0,
       NULL,
       NULL,
       NULL,
       'unknown',
       null,
       null
 FROM ${stats_db_name}.dual
 WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -- /*EOS*/
 -- The helper table is no longer needed once the insert has run.
 DROP TABLE ${stats_db_name}.dual; -- /*EOS*/
 -- In-place clean-up of datasource_tmp: rename the Unknown Repository placeholder to Other
 -- and blank out the -1 year marker.
 UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -- /*EOS*/
 UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -- /*EOS*/
 -- NOTE(review): datasource_languages is also dropped by the added (+) line at the top of
 -- this block - one of the two drops looks redundant.
 DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; -- /*EOS*/
 CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
 SELECT substr(d.id, 4) AS id, langs.languages AS language