diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java index 027bf0735..01d1b9f6a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java @@ -65,7 +65,13 @@ public class RunSQLSparkJob { for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) { log.info("executing: {}", statement); long startTime = System.currentTimeMillis(); - spark.sql(statement).show(); + try { + spark.sql(statement).show(); + } catch (Exception e) { + log.error("Error executing statement: {}", statement, e); + System.err.println("Error executing statement: " + statement + "\n" + e); + throw e; + } log .info( "executed in {}", diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java index ff6c2689a..5f212c242 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java @@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator { // id if (res == 0) { if (left instanceof OafEntity && right instanceof OafEntity) { - res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId()); + res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId()); } } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql index 321fba87a..ede8a18bf 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql +++ 
b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql @@ -1,79 +1,3 @@ ---drop database if exists TARGET cascade; ---create database if not exists TARGET; --- ---create view if not exists TARGET.category as select * from SOURCE.category; ---create view if not exists TARGET.concept as select * from SOURCE.concept; ---create view if not exists TARGET.context as select * from SOURCE.context; ---create view if not exists TARGET.country as select * from SOURCE.country; ---create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; ---create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; ---create view if not exists TARGET.funder as select * from SOURCE.funder; ---create view if not exists TARGET.fundref as select * from SOURCE.fundref; ---create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; ---create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; ---create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; ---create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; ---create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; ---create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; --- ---create table TARGET.result stored as parquet as --- select distinct * from ( --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( --- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research 
and Innovation Center In Information Communication & Knowledge Technologies', --ARC" --- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council --- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? --- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University --- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade --- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki --- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho --- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid --- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen --- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens --- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot --- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University --- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark --- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin --- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt --- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven --- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape --- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute --- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University --- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg --- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) --- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr --- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw --- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly --- 
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete --- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus --- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras --- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki --- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank --- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech --- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University --- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona --- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University --- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia --- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University --- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje --- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan --- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork --- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University --- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech --- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town --- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin --- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology --- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba --- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili --- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University --- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique --- ) )) foo; --- ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; - create view if not exists 
TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context as select * from SOURCE.context; diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql index 35ab42029..064b5425b 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql @@ -81,7 +81,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of 
Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ))) foo; --ANALYZE TABLE TARGET.result COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql index 5ab8c88b5..a3f29a9e3 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql @@ -61,7 +61,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ))) foo; --ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/installProject.sh b/dhp-workflows/dhp-stats-update/installProject.sh new file mode 100755 index 000000000..afd95578d --- /dev/null +++ b/dhp-workflows/dhp-stats-update/installProject.sh @@ -0,0 +1,18 @@ +# Install the whole "dnet-hadoop" project. + +# Delete this module's previous build-files in order to avoid any conflicts. +rm -rf target/ + +# Go to the root directory of this project. +cd ../../ + +# Select the build profile. +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + +# Install the project. +mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true + +# We skip tests for all modules, since they take a big amount of time and some of them fail. +# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script. diff --git a/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh new file mode 100755 index 000000000..2f2fc29d5 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh @@ -0,0 +1,20 @@ +# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file. + +# Select the build profile. +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + +# Build and deploy this module. 
+mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \ + -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats + +# Show the Oozie-job-ID. +echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n" +cat ./target/extract-and-run-on-remote-host.log + +# Check oozie workflow status +# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info + +# Get the application ID from the previous output and check the logs: +# yarn logs -applicationId application_ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql index 9697a1dc8..4551d6282 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql @@ -1,8 +1,10 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------------- -------------------------------------------------------------- -- Stats database creation -------------------------------------------------------------- -------------------------------------------------------------- -DROP database IF EXISTS ${stats_db_name} CASCADE; -CREATE database ${stats_db_name}; +DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/ +CREATE database ${stats_db_name}; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index bbd7b3bbc..48d8961ff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture) @@ -5,27 +7,27 @@ ------------------------------------------------------------------------------------------------ CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * -FROM ${external_stats_db_name}.fundref; +FROM ${external_stats_db_name}.fundref; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * -FROM ${external_stats_db_name}.country; +FROM ${external_stats_db_name}.country; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * -FROM ${external_stats_db_name}.countrygdp; +FROM ${external_stats_db_name}.countrygdp; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * -FROM ${external_stats_db_name}.roarmap; +FROM ${external_stats_db_name}.roarmap; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * -FROM ${external_stats_db_name}.rndexpediture; +FROM ${external_stats_db_name}.rndexpediture; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS SELECT * -FROM ${external_stats_db_name}.licenses_normalized; +FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ @@ -33,23 +35,23 @@ FROM ${external_stats_db_name}.licenses_normalized; ------------------------------------------------------------------------------------------------ 
------------------------------------------------------------------------------------------------ create or replace view ${stats_db_name}.usage_stats as -select * from openaire_prod_usage_stats.usage_stats; +select * from openaire_prod_usage_stats.usage_stats; /*EOS*/ create or replace view ${stats_db_name}.downloads_stats as -select * from openaire_prod_usage_stats.downloads_stats; +select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/ create or replace view ${stats_db_name}.pageviews_stats as -select * from openaire_prod_usage_stats.pageviews_stats; +select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/ create or replace view ${stats_db_name}.views_stats as -select * from openaire_prod_usage_stats.views_stats; +select * from openaire_prod_usage_stats.views_stats; /*EOS*/ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; +DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/ create table ${stats_db_name}.creation_date STORED AS PARQUET as -select date_format(current_date(), 'dd-MM-yyyy') as date; +select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 638fb0f7a..48373af9b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -1,110 +1,11 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------------------- ---------------------------------------------------------------- -- Post processing - Updates on main tables ---------------------------------------------------------------- ---------------------------------------------------------------- ---Datasource temporary table updates -UPDATE ${stats_db_name}.datasource_tmp -SET harvested='true' -WHERE datasource_tmp.id IN (SELECT DISTINCT d.id - FROM ${stats_db_name}.datasource_tmp d, - ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); - --- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables -UPDATE ${stats_db_name}.project_tmp -SET haspubs='yes' -WHERE project_tmp.id IN (SELECT pr.id - FROM ${stats_db_name}.project_results pr, - ${stats_db_name}.result r - WHERE pr.result = r.id - AND r.type = 'publication'); -DROP TABLE IF EXISTS ${stats_db_name}.stored purge; - -CREATE TABLE ${stats_db_name}.project stored as parquet as -SELECT p.id, - p.acronym, - p.title, - p.funder, - p.funding_lvl0, - p.funding_lvl1, - p.funding_lvl2, - p.ec39, - p.type, - p.startdate, - p.enddate, - p.start_year, - p.end_year, - p.duration, - CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, - CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, - CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, - CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, - p.callidentifier, - p.code, - p.totalcost, - p.fundedamount, - p.currency -FROM ${stats_db_name}.project_tmp p - LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np - FROM ${stats_db_name}.project_results pr - INNER JOIN ${stats_db_name}.result r ON pr.result = r.id - WHERE r.type = 'publication' - GROUP 
BY pr.id) AS prr1 on prr1.id = p.id - LEFT JOIN (SELECT pp.id, - max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub, - count(distinct r.id) AS dp - FROM ${stats_db_name}.project_tmp pp, - ${stats_db_name}.project_results pr, - ${stats_db_name}.result r - WHERE pp.id = pr.id - AND pr.result = r.id - AND r.type = 'publication' - AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 - GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; - -UPDATE ${stats_db_name}.publication_tmp -SET delayed = 'yes' -WHERE publication_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); - -UPDATE ${stats_db_name}.dataset_tmp -SET delayed = 'yes' -WHERE dataset_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); - -UPDATE ${stats_db_name}.software_tmp -SET delayed = 'yes' -WHERE software_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); - -UPDATE ${stats_db_name}.otherresearchproduct_tmp -SET delayed = 'yes' -WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); - CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, @@ -116,4 +17,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type 
= 'publication' - AND project.id = result_projects.project; \ No newline at end of file + AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 0a1904de7..4f0b45fed 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,42 +1,4 @@ ------------------------------------------------------------------------------------------------------- --- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; - -CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS -SELECT * -FROM ${stats_db_name}.datasource_tmp; - -DROP TABLE IF EXISTS ${stats_db_name}.publication purge; - -CREATE TABLE ${stats_db_name}.publication stored AS parquet AS -SELECT * -FROM ${stats_db_name}.publication_tmp; - -DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; - -CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS -SELECT * -FROM ${stats_db_name}.dataset_tmp; - -DROP TABLE IF EXISTS ${stats_db_name}.software purge; - -CREATE TABLE ${stats_db_name}.software stored AS parquet AS -SELECT * -FROM ${stats_db_name}.software_tmp; - -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; - -CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS -SELECT * -FROM ${stats_db_name}.otherresearchproduct_tmp; - -DROP TABLE ${stats_db_name}.project_tmp; -DROP TABLE ${stats_db_name}.datasource_tmp; -DROP TABLE 
${stats_db_name}.publication_tmp; -DROP TABLE ${stats_db_name}.dataset_tmp; -DROP TABLE ${stats_db_name}.software_tmp; -DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; +set mapred.job.queue.name=analytics; /*EOS*/ ---------------------------------------------- -- Re-creating views from final parquet tables @@ -54,4 +16,4 @@ SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct; +FROM ${stats_db_name}.otherresearchproduct; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 8c1dbdc4d..a590c190e 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Additional relations @@ -5,10 +7,10 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -16,12 
+18,12 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -29,12 +31,12 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -42,12 +44,12 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE 
d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -55,7 +57,7 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS SELECT * FROM ${stats_db_name}.publication_sources @@ -64,24 +66,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as -select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid +select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid FROM ${openaire_db_name}.result res LATERAL VIEW explode(author) a as auth LATERAL VIEW 
explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type - WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; + WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as -select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype +select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target @@ -91,12 +93,12 @@ where reltype='resultResult' and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' - and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as -select substr(target, 4) as id, count(distinct substr(source, 4)) as citations +select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target @@ -108,12 +110,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and 
r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(target, 4); +group by substr(target, 4); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as -select substr(source, 4) as id, count(distinct substr(target, 4)) as references +select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target @@ -125,4 +127,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(source, 4); \ No newline at end of file +group by substr(source, 4); /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index dafec9a6f..9e71b88f5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -1,4 +1,5 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Additional relations @@ -6,33 +7,33 @@ set mapred.job.queue.name=analytics; -- Licences related tables/views ------------------------------------------------------ 
------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and 
licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses @@ -41,29 +42,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses UNION ALL SELECT * FROM ${stats_db_name}.software_licenses UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS -select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid -from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; +select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid +from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/ -DROP TABLE IF EXISTS 
${stats_db_name}.organization_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as -SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource +SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource FROM ( - SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource + SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as -select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result +select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute -WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; +WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 65a5d789f..08609afff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql 
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ------------------------------------------------------ ------------------------------------------------------ @@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics; ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -18,15 +18,15 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -36,15 +36,15 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst where 
r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -54,15 +54,15 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -72,13 +72,13 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where 
r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed @@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; +select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; +DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as -select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, +select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids -where measures_ids.id!='views' and measures_ids.id!='downloads'; +where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as -select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, +select /*+ COALESCE(100) */ distinct 
substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, cast(rel.properties[0].value as double) apc_amount, rel.properties[1].value apc_currency from ${openaire_db_name}.relation rel join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.result r on r.id=rel.target -where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; +where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index e3d910454..d61b4d2ef 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,27 +1,27 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ------------------------------------------- --- Extra tables, mostly used by indicators -DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as -select r.id, count(distinct p.id) as count +select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET 
as -select r.id, count(distinct p.funder) as count +select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/ create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( @@ -30,39 +30,39 @@ with rcount as ( left outer join ${stats_db_name}.result_projects rp on rp.project=p.id left outer join ${stats_db_name}.result r on r.id=rp.id group by r.type, p.id ) -select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, +select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets, sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='other' then rcount.count else 0 end) as other from rcount -group by rcount.pid; +group by rcount.pid; /*EOS*/ -create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; -create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; -create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; -create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; -create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; -create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; +create or 
replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/ +create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/ +create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/ +create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/ +create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/ create table if not exists ${stats_db_name}.result_instance stored as parquet as -select distinct r.* +select /*+ COALESCE(100) */ distinct r.* from ( select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r -join ${stats_db_name}.result res on res.id=r.id; +join ${stats_db_name}.result res on res.id=r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as -select distinct r.id, r.amount, r.currency +select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, 
cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; +where r.amount is not null; /*EOS*/ -create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; +create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index f5b950fe8..6e7f00b53 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,7 +1,7 @@ -- Sprint 1 ---- drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as -select distinct p.id, coalesce(green_oa, 0) as green_oa +select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa from ${stats_db_name}.publication p left outer join ( select p.id, 1 as green_oa @@ -12,7 +12,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as -select distinct p.id, coalesce(grey_lit, 0) as grey_lit +select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit from ${stats_db_name}.publication p left outer join ( select p.id, 1 as grey_lit 
@@ -23,7 +23,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as -select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref +select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from ${stats_db_name}.publication p left outer join ( select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri @@ -33,7 +33,7 @@ left outer join ( -- Sprint 2 ---- drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as -select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license +select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from ${stats_db_name}.result r left outer join ( select r.id, license.type as lic from ${stats_db_name}.result r @@ -42,7 +42,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as -select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url +select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from ${stats_db_name}.result r left outer join ( select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -52,12 +52,12 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as -select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract +select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) 
has_abstract from ${stats_db_name}.publication; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as -select distinct r.id, coalesce(has_orcid, 0) as has_orcid +select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid from ${stats_db_name}.result r left outer join ( select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/ @@ -66,7 +66,7 @@ left outer join ( ---- Sprint 3 ---- drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as -select distinct r.result as id, coalesce(fundref, 0) as fundref +select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref from ${stats_db_name}.project_results r left outer join ( select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/ @@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par SELECT ro.organization organization, ro.id, o.name from ${stats_db_name}.result_organization ro join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null) - select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations + select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/ @@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store from ${stats_db_name}.result_organization ro join ${stats_db_name}.organization o on 
o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) - select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/ @@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa select o.id organization, o.name, ro.project as project from ${stats_db_name}.organization o join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null) - select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.name<>o2.name @@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor select o.id organization, o.name, o.country , ro.project as project from ${stats_db_name}.organization o join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null) - select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country @@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as join ${stats_db_name}.organization o on o.id=op.id join ${stats_db_name}.project 
p on p.id=op.project where country <> 'UNKNOWN') - select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations + select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations from tmp as f1 join tmp as f2 on f1.project=f2.project where f1.country<>f2.country @@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as select distinct country, ro.id as result from ${stats_db_name}.organization o join ${stats_db_name}.result_organization ro on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) - select o1.country country1, o2.country country2, count(o1.result) as collaborations + select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country @@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as ---- Sprint 4 ---- drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as - select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal + select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from ${stats_db_name}.publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal @@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as - select distinct pd.id, coalesce(is_transformative, 0) as is_transformative + select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative from ${stats_db_name}.publication pd left 
outer join ( select pd.id, 1 as is_transformative @@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as - select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open + select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from ${stats_db_name}.result_instance ri left outer join ( select ri.id, 1 as pub_closed_other_open @@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as ---- Sprint 5 ---- drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as - select id, count(id) as number_of_copies + select /*+ COALESCE(100) */ id, count(id) as number_of_copies from ${stats_db_name}.result_instance group by id; /*EOS*/ ---- Sprint 6 ---- drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as - SELECT result_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join ${stats_db_name}.publication on result_id=id where downloads>0 @@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as - SELECT result_id, repository_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join ${stats_db_name}.publication on result_id=id where 
downloads>0 @@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as - SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join ${stats_db_name}.publication on result_id=id where downloads>0 GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as - SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join ${stats_db_name}.publication on result_id=id where downloads>0 @@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a UNION ALL select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo ) - SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold + SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold FROM ${stats_db_name}.publication pd left outer join ( select pd.id, 1 as is_gold @@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as FROM ${stats_db_name}.datasource WHERE issn_online IS NOT NULL ) as issn WHERE LENGTH(issn) > 7) - SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa + SELECT /*+ COALESCE(100) */ DISTINCT pd.id, 
coalesce(is_hybrid_oa, 0) as is_hybrid_oa FROM ${stats_db_name}.publication_datasources pd LEFT OUTER JOIN ( SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd @@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as -select distinct p.id, coalesce(is_hybrid, 0) is_hybrid +select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid from ${stats_db_name}.publication p left outer join ( select p.id, 1 as is_hybrid @@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet where cast(year as int)>2003 group by ro.organization) --return results_fair/all_results - select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization; /*EOS*/ @@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as -select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness +select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness from allresults ar join result_fair rf on rf.organization=ar.organization; /*EOS*/ @@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as -select cast(allresults.year as int) year, 
allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/ @@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as -select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness +select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness from allresults ar join result_fair rf on rf.organization=ar.organization; /*EOS*/ @@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as - select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/ @@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as -select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, 
result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/ @@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as -select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/ @@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as -select allpubsshare.organization, +select /*+ COALESCE(100) */ allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare @@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as -select cast(allpubsshare.year as int) year, allpubsshare.organization, +select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare @@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/ drop table if exists 
${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as -select distinct p.id, coalesce(has_preprint, 0) as has_preprint +select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint from ${stats_db_name}.publication_classifications p left outer join ( select p.id, 1 as has_preprint @@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as -select distinct p.id, coalesce(is_subscription, 0) as is_subscription +select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription from ${stats_db_name}.publication p left outer join( select p.id, 1 as is_subscription from ${stats_db_name}.publication p @@ -640,7 +640,7 @@ from ${stats_db_name}.publication p drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as -select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid +select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from ${stats_db_name}.result p left outer join ( select p.id, 1 as result_with_pid @@ -654,7 +654,7 @@ group by rf.id; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as -select distinct p.id as id, coalesce(is_interdisciplinary, 0) +select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0) as is_interdisciplinary from pub_fos_totals p left outer join ( @@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/ create table ${stats_db_name}.indi_pub_bronze_oa stored as 
parquet as -select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa +select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication p left outer join ( select p.id, 1 as is_bronze_oa @@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as -select pry.project_id, pry.acronym, pry.result_id, +select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id, coalesce(is_project_result_after, 0) as is_project_result_after from project_year_result_year pry left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after @@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as -select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s +select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s from ${stats_db_name}.funder f left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp @@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu join ${stats_db_name}.project p on p.id=rp.project where cast(year as int)>2003 group by p.funder) -select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness +select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness from allresults join result_fair on result_fair.funder=allresults.funder; /*EOS*/ @@ -745,7 +745,7 @@ allresults as join ${stats_db_name}.result r on r.id=rc.id where 
cast(year as int)>2003 group by rc.ri_initiative) -select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness +select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness from allresults join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/ @@ -817,16 +817,14 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as -select allpubsshare.funder, - (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) - +(case when d is null then 0 else 1 end)) - funder_openess FROM allpubsshare - left outer join (select funder,d from - alldatasetssshare) tmp1 - on tmp1.funder=allpubsshare.funder - left outer join (select funder,s from - allsoftwaresshare) tmp2 - on tmp2.funder=allpubsshare.funder; /*EOS*/ +select /*+ COALESCE(100) */ allpubsshare.funder, + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) + +(case when d is null then 0 else 1 end)) funder_openess +FROM allpubsshare + left outer join (select funder,d from alldatasetssshare) tmp1 + on tmp1.funder=allpubsshare.funder + left outer join (select funder,s from allsoftwaresshare) tmp2 + on tmp2.funder=allpubsshare.funder; /*EOS*/ DROP VIEW pubs_oa; /*EOS*/ DROP VIEW datasets_oa; /*EOS*/ @@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as -select allpubsshare.ri_initiative, +select /*+ COALESCE(100) */ allpubsshare.ri_initiative, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) ris_openess 
FROM allpubsshare @@ -943,7 +941,7 @@ with result_findable as join ${stats_db_name}.project p on p.id=rp.project where cast(year as int)>2003 group by p.funder) -select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable +select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable from allresults join result_findable on result_findable.funder=allresults.funder; /*EOS*/ @@ -952,41 +950,43 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as with result_contexts as -(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc -join ${stats_db_name}.concept on concept.id=rc.concept -join ${stats_db_name}.category on category.id=concept.category -join ${stats_db_name}.context on context.id=category.context), -result_findable as - (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc - join ${stats_db_name}.result r on r.id=rc.id - join ${stats_db_name}.result_pids rp on rp.id=r.id - where cast(r.year as int)>2003 - group by rc.ri_initiative), -allresults as -(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc - join ${stats_db_name}.result r on r.id=rc.id - where cast(r.year as int)>2003 - group by rc.ri_initiative) -select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable + (select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc + join ${stats_db_name}.concept on concept.id=rc.concept + join ${stats_db_name}.category on category.id=concept.category + join ${stats_db_name}.context on context.id=category.context), + result_findable as + (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc + join 
${stats_db_name}.result r on r.id=rc.id + join ${stats_db_name}.result_pids rp on rp.id=r.id + where cast(r.year as int)>2003 + group by rc.ri_initiative), + allresults as + (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc + join ${stats_db_name}.result r on r.id=rc.id + where cast(r.year as int)>2003 + group by rc.ri_initiative) +select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable from allresults join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/ +drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/ + create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as with org_names_pids as -(select org.id,name, pid from ${stats_db_name}.organization org -join ${stats_db_name}.organization_pids op on org.id=op.id), -publicly_funded_orgs as -(select distinct name from -(select pf.name from stats_ext.insitutions_for_publicly_funded pf -join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' -union all -select pf.name from stats_ext.insitutions_for_publicly_funded pf -join ${stats_db_name}.project p on p.funder=pf.name -union all -select op.name from stats_ext.insitutions_for_publicly_funded pf -join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) -and pf.publicly_funded='yes') foo) -select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded + (select org.id,name, pid from ${stats_db_name}.organization org + join ${stats_db_name}.organization_pids op on org.id=op.id), + publicly_funded_orgs as + (select distinct name from + (select pf.name from stats_ext.insitutions_for_publicly_funded pf + join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' + union all + select pf.name from stats_ext.insitutions_for_publicly_funded pf + join ${stats_db_name}.project p on p.funder=pf.name + union all + select 
op.name from stats_ext.insitutions_for_publicly_funded pf + join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) + and pf.publicly_funded='yes') foo) +select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded from ${stats_db_name}.publication p left outer join ( select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro @@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/ create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as -select distinct p.id, coalesce(green_with_license, 0) as green_with_license +select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license from ${stats_db_name}.publication p left outer join ( select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p @@ -1006,7 +1006,7 @@ left outer join ( drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ create table ${stats_db_name}.result_country stored as parquet as -select distinct id, country +select /*+ COALESCE(100) */ distinct id, country from ( select ro.id, o.country from ${stats_db_name}.result_organization ro @@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as -select distinct r.id, coalesce(oa_with_license,0) as oa_with_license +select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/ @@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where 
r.bestlicence='Open drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/ create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as with without_license as -(select distinct id from ${stats_db_name}.indi_result_oa_with_license -where oa_with_license=0) -select distinct r.id, coalesce(oa_without_license,0) as oa_without_license + (select distinct id from ${stats_db_name}.indi_result_oa_with_license + where oa_with_license=0) +select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_without_license from ${stats_db_name}.result r @@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /* create table ${stats_db_name}.indi_result_under_transformative stored as parquet as with transformative_dois as ( select distinct doi from stats_ext.transformative_facts) -select distinct r.id, coalesce(under_transformative,0) as under_transformative +select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative from ${stats_db_name}.result r left outer join ( select distinct rp.id, 1 as under_transformative diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index c837ea579..80256e2df 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -1,30 +1,30 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ---------------------------------------------------- -- Shortcuts for various 
definitions in stats db --- ---------------------------------------------------- -- Peer reviewed: -drop table if exists ${stats_db_name}.result_peerreviewed purge; +drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as -select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed +select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id -left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/ -- Green OA: -drop table if exists ${stats_db_name}.result_greenoa purge; +drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as -select r.id, case when green.green_oa=1 then true else false end as green +select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r -left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/ -- GOLD OA: -drop table if exists ${stats_db_name}.result_gold purge; +drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as -select r.id, case when gold.is_gold=1 then true else false end as gold +select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r - left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/ \ No 
newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index fe3bb6799..a2be22603 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,58 +1,26 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ --- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, +-- replace the creation of the result view with a table (keeping the column names and types of the former result_tmp DDL, i.e. `year` INT and access_mode), which will include the boolean fields from the previous tables (green, gold, -- peer reviewed) -drop table if exists ${stats_db_name}.result_tmp; -CREATE TABLE ${stats_db_name}.result_tmp ( - id STRING, - title STRING, - publisher STRING, - journal STRING, - `date` STRING, - `year` INT, - bestlicence STRING, - access_mode STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING , - peer_reviewed BOOLEAN, - green BOOLEAN, - gold BOOLEAN) -clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); +drop view if exists ${stats_db_name}.result; /*EOS*/ +drop table if exists ${stats_db_name}.result; /*EOS*/ -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.publication r +CREATE TABLE ${stats_db_name}.result stored as parquet as +SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, cast(DATE_FORMAT(r.`date`, 'yyyy') as int) as `year`, r.bestlicence,
r.bestlicence as access_mode, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold +FROM ( + (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.publication) + UNION ALL + (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.dataset) + UNION ALL + (select id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.software) + UNION ALL + (select id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.otherresearchproduct) + ) r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.dataset r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM
${stats_db_name}.software r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.otherresearchproduct r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; - -drop table if exists ${stats_db_name}.result; -drop view if exists ${stats_db_name}.result; -create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 4f7247e14..0abec2358 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ -------------------------------------------------------------- -------------------------------------------------------------- @@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics; -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS 
${stats_db_name}.publication_tmp purge; -CREATE TABLE ${stats_db_name}.publication_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING -) - clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); +DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/ -INSERT INTO ${stats_db_name}.publication_tmp -SELECT substr(p.id, 4) as id, - p.title[0].value as title, - p.publisher.value as publisher, - p.journal.name as journal, - p.dateofacceptance.value as date, - date_format(p.dateofacceptance.value, 'yyyy') as year, - p.bestaccessright.classname as bestlicence, - p.embargoenddate.value as embargo_end_date, - false as delayed, - size(p.author) as authors, - concat_ws('\u003B', p.source.value) as source, - case when size(p.description) > 0 then true else false end as abstract, - 'publication' as type -from ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +CREATE TABLE ${stats_db_name}.publication stored as parquet as +with pub_pr as ( + select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.publication pub + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false +), + pub_delayed as ( + select pub_id, max(delayed) as delayed + from pub_pr + group by pub_id + ) +select /*+ COALESCE(100) */ + substr(pub.id, 4) as id, + 
pub.title[0].value as title, + pub.publisher.value as publisher, + pub.journal.name as journal, + pub.dateofacceptance.value as date, + date_format(pub.dateofacceptance.value, 'yyyy') as year, + pub.bestaccessright.classname as bestlicence, + pub.embargoenddate.value as embargo_end_date, + coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects. + size(pub.author) as authors, + concat_ws('\u003B', pub.source.value) as source, + case when size(pub.description) > 0 then true else false end as abstract, + 'publication' as type +from ${openaire_db_name}.publication pub + left outer join pub_delayed on pub.id=pub_delayed.pub_id +where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; + +DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, instancetype.classname as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then 
concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance @@ -73,44 +73,44 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS -select substr(p.id, 4) as id, p.language.classname as language +select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_oids STORED 
AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as -select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, xpath_string(citation.value, 
"//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; + and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 4abb6bdbc..4940bb96d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context as select * from SOURCE.context; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index a8392b226..7b52b3b20 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; 
/*EOS*/ + drop database if exists TARGET cascade; create database if not exists TARGET; @@ -81,11 +83,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden - 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology - 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University - 'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ) )) foo; create view if not exists TARGET.category as select * from SOURCE.category; @@ -256,7 +264,6 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f create table 
TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); -create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql index 4469782f0..0f3dc1d4f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql index a28206d56..2a082c2cd 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql index ce6475c22..759843d68 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 62c68c625..714e1c402 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; @@ -65,5 +67,11 @@ create table TARGET.result stored as parquet as 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- 
University of Vienna 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University - 'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications - ))) foo; \ No newline at end of file + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment + ))) foo; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 66620ac38..85d90eaf1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,15 +1,17 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + create table ${observatory_db_name}.result_cc_licence stored as parquet as -select r.id, coalesce(rln.count, 0) > 0 as cc_licence +select /*+ COALESCE(100) */ r.id, coalesce(rln.count, 0) > 0 as cc_licence from ${stats_db_name}.result r left outer join ( select rl.id, sum(case when rl.type like 'CC%' 
then 1 else 0 end) as count from ${stats_db_name}.result_licenses rl group by rl.id -) rln on rln.id=r.id; +) rln on rln.id=r.id; /*EOS*/ create table ${observatory_db_name}.result_affiliated_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -35,11 +37,11 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -65,11 +67,11 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -95,11 +97,11 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open 
Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -127,10 +129,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -158,10 +160,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -187,10 +189,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by 
r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -216,10 +218,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -247,10 +249,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, 
@@ -278,10 +280,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -309,10 +311,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_year stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -340,11 +342,11 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table 
${observatory_db_name}.result_deposited_year_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -372,10 +374,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -403,10 +405,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -434,10 +436,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, 
rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -465,10 +467,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -496,10 +498,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -529,10 +531,10 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else 
false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -562,4 +564,4 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 0384de4ec..0e1e02b12 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Dataset table/view and Dataset related tables/views @@ -5,75 +7,74 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/ -CREATE TABLE 
${stats_db_name}.dataset_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING +CREATE TABLE ${stats_db_name}.dataset stored as parquet as +with datast_pr as ( + select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.dataset datast + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false +), +datast_delayed as ( + select datast_id, max(delayed) as delayed + from datast_pr + group by datast_id ) - clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +select /*+ COALESCE(100) */ + substr(datast.id, 4) as id, + datast.title[0].value as title, + datast.publisher.value as publisher, + cast(null as string) as journal, + datast.dateofacceptance.value as date, + date_format(datast.dateofacceptance.value, 'yyyy') as year, + datast.bestaccessright.classname as bestlicence, + datast.embargoenddate.value as embargo_end_date, + coalesce(datast_delayed.delayed, false) as delayed, -- It's delayed, when the dataset was published after the end of the project. 
+ size(datast.author) as authors, + concat_ws('\u003B', datast.source.value) as source, + case when size(datast.description) > 0 then true else false end as abstract, + 'dataset' as type +from ${openaire_db_name}.dataset datast + left outer join datast_delayed on datast.id=datast_delayed.datast_id +where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.dataset_tmp -SELECT substr(d.id, 4) AS id, - d.title[0].value AS title, - d.publisher.value AS publisher, - cast(null AS string) AS journal, - d.dateofacceptance.value as date, - date_format(d.dateofacceptance.value, 'yyyy') AS year, - d.bestaccessright.classname AS bestlicence, - d.embargoenddate.value AS embargo_end_date, - false AS delayed, - size(d.author) AS authors, - concat_ws('\u003B', d.source.value) AS source, - CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, - 'dataset' AS type -FROM ${openaire_db_name}.dataset d -WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; -DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; + and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/ CREATE TABLE 
${stats_db_name}.dataset_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS -SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.dataset p @@ -82,35 +83,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and 
d.datainfo.invisible=false) d ON p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics 
purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index d8f4d65e4..0ccb17fcc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------- -------------------------------------------------------- -- Software table/view and Software related tables/views @@ -5,72 +7,74 @@ -------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; -CREATE TABLE ${stats_db_name}.software_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING +DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/ + +CREATE TABLE ${stats_db_name}.software stored as parquet as +with soft_pr as ( + select soft.id as soft_id, case when 
(to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.software soft + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false +), +soft_delayed as ( + select soft_id, max(delayed) as delayed + from soft_pr + group by soft_id ) - clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +select /*+ COALESCE(100) */ + substr(soft.id, 4) as id, + soft.title[0].value as title, + soft.publisher.value as publisher, + cast(null as string) as journal, + soft.dateofacceptance.value as date, + date_format(soft.dateofacceptance.value, 'yyyy') as year, + soft.bestaccessright.classname as bestlicence, + soft.embargoenddate.value as embargo_end_date, + coalesce(soft_delayed.delayed, false) as delayed, -- It's delayed, when the software was published after the end of the project. 
+ size(soft.author) as authors, + concat_ws('\u003B', soft.source.value) as source, + case when size(soft.description) > 0 then true else false end as abstract, + 'software' as type +from ${openaire_db_name}.software soft + left outer join soft_delayed on soft.id=soft_delayed.soft_id +where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.software_tmp -SELECT substr(s.id, 4) as id, - s.title[0].value AS title, - s.publisher.value AS publisher, - CAST(NULL AS string) AS journal, - s.dateofacceptance.value AS DATE, - date_format(s.dateofacceptance.value, 'yyyy') AS YEAR, - s.bestaccessright.classname AS bestlicence, - s.embargoenddate.value AS embargo_end_date, - FALSE AS delayed, - SIZE(s.author) AS authors, - concat_ws('\u003B', s.source.value) AS source, - CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, - 'software' as type -from ${openaire_db_name}.software s -where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; -DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS -SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; + and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ CREATE TABLE 
${stats_db_name}.software_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ + +DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.software p @@ -79,35 +83,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; + 
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS -select substr(p.id, 4) AS id, p.language.classname AS language +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ CREATE TABLE 
${stats_db_name}.software_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index fae0fbb63..cd7834d84 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- Otherresearchproduct table/view and Otherresearchproduct related tables/views @@ -5,101 +7,103 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING -) 
CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as +with other_pr as ( + select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.otherresearchproduct other + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false +), +other_delayed as ( + select other_id, max(delayed) as delayed + from other_pr + group by other_id +) +select /*+ COALESCE(100) */ + substr(other.id, 4) as id, + other.title[0].value as title, + other.publisher.value as publisher, + cast(null as string) as journal, + other.dateofacceptance.value as date, + date_format(other.dateofacceptance.value, 'yyyy') as year, + other.bestaccessright.classname as bestlicence, + other.embargoenddate.value as embargo_end_date, + coalesce(other_delayed.delayed, false) as delayed, -- It's delayed, when the other research product was published after the end of the project. + size(other.author) as authors, + concat_ws('\u003B', other.source.value) as source, + case when size(other.description) > 0 then true else false end as abstract, + 'other' as type +from ${openaire_db_name}.otherresearchproduct other + left outer join other_delayed on other.id=other_delayed.other_id +where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.otherresearchproduct_tmp -SELECT substr(o.id, 4) AS id, - o.title[0].value AS title, - o.publisher.value AS publisher, - CAST(NULL AS string) AS journal, - o.dateofacceptance.value AS DATE, - date_format(o.dateofacceptance.value, 'yyyy') AS year, -
o.bestaccessright.classname AS bestlicence, - o.embargoenddate.value as embargo_end_date, - FALSE AS delayed, - SIZE(o.author) AS authors, - concat_ws('\u003B', o.source.value) AS source, - CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, - 'other' AS type -FROM ${openaire_db_name}.otherresearchproduct o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; -- Otherresearchproduct_citations -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS -SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; + and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ + +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; +DROP TABLE IF EXISTS 
${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p - LEFT OUTER JOIN(SELECT substr(d.id, 4) id + LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; 
/*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, 
subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 165f77946..d261c96e2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -1,110 +1,120 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; +DROP TABLE IF EXISTS 
${stats_db_name}.project_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS -SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ create table ${stats_db_name}.project_classification STORED AS PARQUET as -select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 +select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and 
class.h2020programme is not null; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.project_tmp -( - id STRING, - acronym STRING, - title STRING, - funder STRING, - funding_lvl0 STRING, - funding_lvl1 STRING, - funding_lvl2 STRING, - ec39 STRING, - type STRING, - startdate STRING, - enddate STRING, - start_year INT, - end_year INT, - duration INT, - haspubs STRING, - numpubs INT, - daysforlastpub INT, - delayedpubs INT, - callidentifier STRING, - code STRING, - totalcost FLOAT, - fundedamount FLOAT, - currency STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +CREATE TABLE ${stats_db_name}.project stored as parquet as +with pr_pub as ( + select pr.id as pr_id, pub.id as pub_id, + (case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed, + max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub + from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication + where datainfo.deletedbyinference = false and datainfo.invisible = false) pub + join ${openaire_db_name}.relation rel + on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project + where datainfo.deletedbyinference = false and datainfo.invisible = false) pr + on pr.id=rel.target + group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate +), +-- Per-project aggregates. pr_pub has one row per (project, publication), so it must not be joined to project directly: that would emit one project row per publication. +num_pubs_pr as ( + select pr_id, count( distinct pub_id) as num_pubs, max(daysForlastPub) as daysForlastPub + from pr_pub + group by pr_id +), +pub_delayed as ( + select pr_id, pub_id, max(delayed) as delayed + from pr_pub + group by pr_id, pub_id +), +num_pub_delayed as ( + select pr_id, count(distinct pub_id) as num_delayed + from pub_delayed +
where delayed + group by pr_id +) +select /*+ COALESCE(100) */ + substr(p.id, 4) as id, + p.acronym.value as acronym, + p.title.value as title, + xpath_string(p.fundingtree[0].value, '//funder/name') as funder, + xpath_string(p.fundingtree[0].value, '//funding_level_0/name') as funding_lvl0, + xpath_string(p.fundingtree[0].value, '//funding_level_1/name') as funding_lvl1, + xpath_string(p.fundingtree[0].value, '//funding_level_2/name') as funding_lvl2, + p.ecsc39.value as ec39, + p.contracttype.classname as type, + p.startdate.value as startdate, + p.enddate.value as enddate, + year(p.startdate.value) as start_year, + year(p.enddate.value) as end_year, + cast(months_between(p.enddate.value, p.startdate.value) as int) as duration, + case when num_pubs_pr.pr_id is null then 'no' else 'yes' end as haspubs, + num_pubs_pr.num_pubs as numpubs, + num_pubs_pr.daysForlastPub as daysForlastPub, + npd.num_delayed as delayedpubs, + p.callidentifier.value as callidentifier, + p.code.value as code, + p.totalcost as totalcost, + p.fundedamount as fundedamount, + p.currency.value as currency +from ${openaire_db_name}.project p +left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id +left outer join num_pub_delayed npd on npd.pr_id=p.id +where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.project_tmp -SELECT substr(p.id, 4) AS id, - p.acronym.value AS acronym, - p.title.value AS title, - xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, - xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, - xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, - xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, - p.ecsc39.value AS ec39, - p.contracttype.classname AS type, - p.startdate.value AS startdate, - p.enddate.value AS enddate, - year(p.startdate.value) AS start_year, - year(p.enddate.value)
AS end_year, - CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, - 'no' AS haspubs, - 0 AS numpubs, - 0 AS daysforlastpub, - 0 AS delayedpubs, - p.callidentifier.value AS callidentifier, - p.code.value AS code, - p.totalcost AS totalcost, - p.fundedamount AS fundedamount, - p.currency.value AS currency -FROM ${openaire_db_name}.project p -WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; -DROP TABLE IF EXISTS ${stats_db_name}.funder purge; +DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ create table ${stats_db_name}.funder STORED AS PARQUET as -select distinct xpath_string(fund, '//funder/id') as id, +select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS -SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, properties[0].value contribution, properties[1].value currency from ${openaire_db_name}.relation r LATERAL VIEW explode (r.properties) properties where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' -and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file +and properties[0].value>0.0 and 
r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index c0993ef0b..bffd59ef1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------- ---------------------------------------------------- -- Result table/view and Result related tables/views @@ -7,16 +9,16 @@ -- Views on temporary tables that should be re-created in the end CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.publication_tmp +FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.software_tmp +FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.dataset_tmp +FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct_tmp; +FROM ${stats_db_name}.otherresearchproduct; /*EOS*/ -- Views on final tables CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS @@ -30,7 +32,7 @@ SELECT * FROM ${stats_db_name}.dataset_datasources UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_datasources; +FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS SELECT * @@ -43,7 +45,7 @@ SELECT * FROM ${stats_db_name}.dataset_citations UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_citations; +FROM 
${stats_db_name}.otherresearchproduct_citations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS SELECT * @@ -56,7 +58,7 @@ SELECT * FROM ${stats_db_name}.dataset_classifications UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_classifications; +FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS SELECT * @@ -69,7 +71,7 @@ SELECT * FROM ${stats_db_name}.dataset_concepts UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_concepts; +FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS SELECT * @@ -82,7 +84,7 @@ SELECT * FROM ${stats_db_name}.dataset_languages UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_languages; +FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS SELECT * @@ -95,7 +97,7 @@ SELECT * FROM ${stats_db_name}.dataset_oids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_oids; +FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * @@ -108,7 +110,7 @@ SELECT * FROM ${stats_db_name}.dataset_pids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_pids; +FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * @@ -121,37 +123,44 @@ SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_topics; +FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; + +DROP TABLE IF EXISTS ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/ + +create table ${stats_db_name}.result_fos_base_tmp stored as parquet as +select /*+ COALESCE(100) */ id, topic from ${stats_db_name}.result_topics where type='Fields of Science and Technology 
classification'; /*EOS*/ + +DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/ create table ${stats_db_name}.result_fos stored as parquet as with - lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), - lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), - lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'), - lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') -select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 + lvl1 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '__ %'), + lvl2 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '____ %'), + lvl3 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '______ %'), + lvl4 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '________ %') +select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 from lvl1 - join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) - join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); + join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) + join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/ + +DROP TABLE ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_organization 
purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.target like '50|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS -select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance +select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; - + JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 07204db0c..98225af14 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -- noinspection SqlNoDataSourceInspectionForFile ------------------------------------------------------------ @@ -5,108 +7,65 @@ -- Datasource table/view and Datasource related tables/views 
------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.datasource_tmp -( - `id` string, - `name` STRING, - `type` STRING, - `dateofvalidation` STRING, - `yearofvalidation` string, - `harvested` BOOLEAN, - `piwik_id` INT, - `latitude` STRING, - `longitude` STRING, - `websiteurl` STRING, - `compatibility` STRING, - issn_printed STRING, - issn_online STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +CREATE TABLE ${stats_db_name}.datasource stored as parquet as +with piwik_datasource as ( + select id, split(originalidd, '\\:')[1] as piwik_id + from ${openaire_db_name}.datasource + lateral view explode(originalid) temp as originalidd + where originalidd like "piwik:%" +) +select /*+ COALESCE(100) */ + substr(dtrce.id, 4) as id, + case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name, + dtrce.datasourcetype.classname as type, + dtrce.dateofvalidation.value as dateofvalidation, + case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end as yearofvalidation, + case when res.d_id is null then false else true end as harvested, + case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end as piwik_id, + dtrce.latitude.value as latitude, + dtrce.longitude.value as longitude, + dtrce.websiteurl.value as websiteurl, + dtrce.openairecompatibility.classid as compatibility, + dtrce.journal.issnprinted as issn_printed, + dtrce.journal.issnonline as issn_online +from ${openaire_db_name}.datasource dtrce + left outer join (select inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id + left outer join 
piwik_datasource piwik_d on piwik_d.id=dtrce.id +where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/ --- Insert statement that takes into account the piwik_id of the openAIRE graph -INSERT INTO ${stats_db_name}.datasource_tmp -SELECT substr(d1.id, 4) AS id, - officialname.value AS name, - datasourcetype.classname AS type, - dateofvalidation.value AS dateofvalidation, - date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation, - FALSE AS harvested, - CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id, - d1.latitude.value AS latitude, - d1.longitude.value AS longitude, - d1.websiteurl.value AS websiteurl, - d1.openairecompatibility.classid AS compatibility, - d1.journal.issnprinted AS issn_printed, - d1.journal.issnonline AS issn_online -FROM ${openaire_db_name}.datasource d1 - LEFT OUTER JOIN - (SELECT id, split(originalidd, '\\:')[1] as piwik_id - FROM ${openaire_db_name}.datasource - LATERAL VIEW EXPLODE(originalid) temp AS originalidd - WHERE originalidd like "piwik:%") AS d2 - ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; --- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
--- Creating a temporary dual table that will be removed after the following insert - -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); - -INSERT INTO ${stats_db_name}.dual VALUES ('X'); - -INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, - `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) -SELECT 'other', - 'Other', - 'Repository', - NULL, - NULL, - false, - 0, - NULL, - NULL, - NULL, - 'unknown', - null, - null -FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -DROP TABLE ${stats_db_name}.dual; - -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; - -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, langs.languages AS language +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where 
d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. -DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS -select substr(d.id, 4) as id, substr(cf.key, 4) as datasource +select /*+ COALESCE(100) */ substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; +FROM ${stats_db_name}.result_datasources; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index 19d301e27..f504a5c12 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -1,22 +1,24 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------------------- ---------------------------------------------------------------- -- Organization table/view and Organization related tables/views ---------------------------------------------------------------- ---------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.organization purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS -SELECT substr(o.id, 4) as id, +SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname.value as legalshortname, o.country.classid as country FROM ${openaire_db_name}.organization o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; +WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource -FROM ${stats_db_name}.datasource_organizations; +FROM ${stats_db_name}.datasource_organizations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id -FROM ${stats_db_name}.project_organizations; \ No newline at end of file +FROM ${stats_db_name}.project_organizations; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index d5f9ae886..d08cf8f59 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -150,190 +150,367 @@ - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())} + ${wf:actionData(wf:lastErrorNode())['stackTrace']}] - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step1 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step2 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step3 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - 
${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step4 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step6 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step7 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql 
+ --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step8 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step9 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step10 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step11 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + 
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step12 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step13 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step14 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - 
openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step15 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step15_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} + - + ${jobTracker} ${nameNode} contexts.sh @@ -380,29 +557,51 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_1-definitions + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + 
${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - + ${jobTracker} ${nameNode} finalizedb.sh @@ -415,7 +614,7 @@ - + ${jobTracker} ${nameNode} monitor.sh @@ -448,7 +647,7 @@ - + ${jobTracker} ${nameNode} observatory-pre.sh @@ -462,18 +661,29 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - observatory_db_name=${observatory_db_name} - + + yarn + cluster + Step21-createObservatoryDB + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql + --stats_db_name${stats_db_name} + --observatory_db_name${observatory_db_name} + - + ${jobTracker} ${nameNode} observatory-post.sh @@ -486,7 +696,7 @@ - + ${jobTracker} ${nameNode} copyDataToImpalaCluster.sh @@ -505,7 +715,7 @@ - + ${jobTracker} ${nameNode} createPDFsAggregated.sh @@ -521,7 +731,7 @@ - + ${jobTracker} ${nameNode} finalizeImpalaCluster.sh @@ -540,7 +750,7 @@ - + ${jobTracker} ${nameNode} updateCache.sh