From 2a52a42169ecdede8024fc9fcbadb1008e49f912 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 6 Dec 2022 10:10:21 +0200 Subject: [PATCH 01/10] Added 4 institutions: -University of Modena and Reggio Emilia -Bilkent University -Saints Cyril and Methodius University of Skopje -University of Milan --- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 2bdcbfa3d..98dca7129 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -50,7 +50,11 @@ create table TARGET.result stored as parquet as 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona - 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University + 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia + 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University + 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan ) )) foo; compute stats TARGET.result; From 6449ff42073979226d1c02c08ee608173657bbef Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 22 Dec 2022 10:18:21 +0200 Subject: [PATCH 02/10] 1. 
Added a decision node that enables the workflow to make a selection on the execution path to follow 2. Added new organization 3. Added 5 new tables from Eurostat --- .../oa/graph/stats/oozie_app/scripts/step15_5.sql | 5 +++++ .../oozie_app/scripts/step20-createMonitorDB.sql | 8 +++++++- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 13 ++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 86ead4a2c..584de0a56 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -30,6 +30,11 @@ from rcount group by rcount.pid; create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; +create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; +create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; +create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; +create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 98dca7129..3e69ff58d 100644 ---
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -10,6 +10,11 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_ create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create table TARGET.result stored as parquet as select distinct * from ( @@ -54,7 +59,8 @@ create table TARGET.result stored as parquet as 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje - 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork ) )) foo; compute stats TARGET.result; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 08d33f4e8..17dcd1fdd 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -70,7 +70,18 @@ - + + + + + ${wf:conf('resumeFrom') eq 'Step1'} + ${wf:conf('resumeFrom') eq 'step20-createMonitorDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] From 592013d5ddf1bac85dee76bb84931b4a31ad36b0 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 23 Dec 2022 09:43:16 +0200 Subject: [PATCH 03/10] Added more steps in decision node --- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 17dcd1fdd..c68ae46ca 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -75,9 +75,31 @@ ${wf:conf('resumeFrom') eq 'Step1'} + ${wf:conf('resumeFrom') eq 'Step2'} + ${wf:conf('resumeFrom') eq 'Step3'} + ${wf:conf('resumeFrom') eq 'Step4'} + ${wf:conf('resumeFrom') eq 'Step5'} + ${wf:conf('resumeFrom') eq 'Step6'} + ${wf:conf('resumeFrom') eq 'Step7'} + ${wf:conf('resumeFrom') eq 'Step8'} + ${wf:conf('resumeFrom') eq 'Step9'} + ${wf:conf('resumeFrom') eq 'Step10'} + ${wf:conf('resumeFrom') eq 'Step11'} + ${wf:conf('resumeFrom') eq 'Step12'} + ${wf:conf('resumeFrom') eq 'Step13'} + ${wf:conf('resumeFrom') eq 'Step14'} + ${wf:conf('resumeFrom') eq 'Step15'} + ${wf:conf('resumeFrom') eq 'Step15_5'} + ${wf:conf('resumeFrom') eq 'Contexts'} + 
${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'} + ${wf:conf('resumeFrom') eq 'Step16_1-definitions'} + ${wf:conf('resumeFrom') eq 'Step16_5'} + ${wf:conf('resumeFrom') eq 'Step19-finalize'} ${wf:conf('resumeFrom') eq 'step20-createMonitorDB'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} + ${wf:conf('resumeFrom') eq 'Step22'} From becb242c1797b3ba40e5e92b4dd263248d59b14d Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 4 Jan 2023 16:50:29 +0200 Subject: [PATCH 04/10] Monitor DB only Workflow --- dhp-workflows/dhp-monitor-update/pom.xml | 32 +++ .../monitor/oozie_app/config-default.xml | 34 +++ .../graph/monitor/oozie_app/monitor-post.sh | 21 ++ .../dhp/oa/graph/monitor/oozie_app/monitor.sh | 24 ++ .../oozie_app/scripts/createMonitorDB.sql | 241 ++++++++++++++++++ .../oa/graph/monitor/oozie_app/updateCache.sh | 4 + .../oa/graph/monitor/oozie_app/workflow.xml | 105 ++++++++ 7 files changed, 461 insertions(+) create mode 100644 dhp-workflows/dhp-monitor-update/pom.xml create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh create mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-monitor-update/pom.xml 
b/dhp-workflows/dhp-monitor-update/pom.xml new file mode 100644 index 000000000..ca0bb9837 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/pom.xml @@ -0,0 +1,32 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-monitor-update + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + false + + + + + diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml new file mode 100644 index 000000000..63fc84d75 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + stats_tool_api_url + ${stats_tool_api_url} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh new file mode 100644 index 000000000..b8c71681a --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh @@ -0,0 +1,21 @@ +export 
PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +impala-shell -q "invalidate metadata;" +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow monitor database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh new file mode 100644 index 000000000..f39bf4893 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -0,0 +1,24 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH + +echo "Creating monitor database" +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql new file mode 100644 index 000000000..e9e460cb0 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -0,0 +1,241 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +create view if not exists 
TARGET.funder as select * from SOURCE.funder; +create view if not exists TARGET.fundref as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council + 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? 
+ 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University + 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade + 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki + 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho + 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen + 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens + -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot + 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University + 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark + 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin + 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt + 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven + 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape + 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute + 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University + 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) + 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr + 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw + 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University of Thessaly + 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete + 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus + 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras + 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle
University of Thessaloniki + 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank + 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech + 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University + 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University + 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia + 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University + 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork + ) )) foo; + +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; + +create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; + +create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; + +create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; + +create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; + +create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result 
r where r.id=orig.id); +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; + +create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; + +create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; + +create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; + +create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; + +create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; + +create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; + +create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; + +create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; + +create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE 
TARGET.result_oids COMPUTE STATISTICS; + +create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; + +create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; + +create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; + +create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; + +create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; + +create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; + +create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; + +create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; + +create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); +create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); +create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; + +-- datasources +create view if not exists TARGET.datasource as select * from SOURCE.datasource; +create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; +create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations; +create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; + +create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; +ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; + +-- organizations +create view if not exists TARGET.organization as select * from SOURCE.organization; +create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources; +create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids; +create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects; +create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources; + +-- projects +create view if not exists TARGET.project as select * from SOURCE.project; +create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; +create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; +create view if not exists TARGET.project_resultcount as 
select * from SOURCE.project_resultcount; +create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; + +create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; +ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; + +-- indicators +-- Sprint 1 ---- +create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +-- Sprint 2 ---- +create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 
from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +---- Sprint 3 ---- +create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; +create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; +create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; +create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; +create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; +create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; +create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; +---- Sprint 4 ---- +create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +---- Sprint 5 ---- +create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE 
TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +---- Sprint 6 ---- +create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +---- Sprint 7 ---- +create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; +create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; +create view TARGET.indi_org_fairness_pub_pr as select * from 
SOURCE.indi_org_fairness_pub_pr; +create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; +create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub; +create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year; +create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year; +create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; +create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; +create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; +create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_datasets_gold_oa; +--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_software_gold_oa; + diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh 
b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh new file mode 100644 index 000000000..03aa535e1 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +curl --request GET $1/cache/updateCache +sleep 6h \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml new file mode 100644 index 000000000..2bcff70ac --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + stats_db_name + the target stats database name + + + stats_db_shadow_name + the name of the shadow schema + + + monitor_db_name + the target monitor db name + + + monitor_db_shadow_name + the name of the shadow monitor db + + + stats_tool_api_url + The url of the API of the stats tool. Is used to trigger the cache update. + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. 
+ + + context_api_url + the base url of the context api (https://services.openaire.eu/openaire) + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + monitor.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/createMonitorDB.sql + monitor.sh + + + + + + + + + ${jobTracker} + ${nameNode} + monitor-post.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + monitor-post.sh + + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + + + + + + + \ No newline at end of file From 686580a22068b6437ff4e8dafccbf919d06e2a77 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 11:18:03 +0200 Subject: [PATCH 05/10] - New Monitor DB workflow - New Organization added --- .../graph/monitor/oozie_app/monitor-post.sh | 21 -- .../dhp/oa/graph/monitor/oozie_app/monitor.sh | 16 +- .../oozie_app/scripts/createMonitorDB.sql | 293 ++++++------------ .../oa/graph/monitor/oozie_app/updateCache.sh | 4 - .../oa/graph/monitor/oozie_app/workflow.xml | 28 -- .../scripts/step20-createMonitorDB.sql | 3 +- 6 files changed, 107 insertions(+), 258 deletions(-) delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh delete mode 100644 dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh deleted file mode 100644 index b8c71681a..000000000 --- 
a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh +++ /dev/null @@ -1,21 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -export SOURCE=$1 -export TARGET=$2 -export SHADOW=$3 - -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - -echo "Impala shell finished" - -echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh index f39bf4893..36cfcd325 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -8,17 +8,11 @@ fi export SOURCE=$1 export TARGET=$2 -export SHADOW=$3 -export SCRIPT_PATH=$4 +export SCRIPT_PATH=$3 -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" -export HADOOP_USER_NAME="oozie" - -echo "Getting file from " $SCRIPT_PATH -hdfs dfs -copyToLocal $SCRIPT_PATH +echo "Getting file from " $3 +hdfs dfs -copyToLocal $3 echo "Creating monitor database" -#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo -cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo -hive $HIVE_OPTS -f foo -echo "Hive shell finished" \ No newline at end of file +cat createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index e9e460cb0..2c46082fa 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ 
b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -1,241 +1,148 @@ -drop database if exists TARGET cascade; -create database if not exists TARGET; +DROP TABLE IF EXISTS TARGET.result_new; -create view if not exists TARGET.category as select * from SOURCE.category; -create view if not exists TARGET.concept as select * from SOURCE.concept; -create view if not exists TARGET.context as select * from SOURCE.context; -create view if not exists TARGET.country as select * from SOURCE.country; -create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; -create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; -create view if not exists TARGET.funder as select * from SOURCE.funder; -create view if not exists TARGET.fundref as select * from SOURCE.fundref; -create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; -create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; -create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; -create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; -create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; -create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; - -create table TARGET.result stored as parquet as +create table TARGET.result_new stored as parquet as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( - 'openorgs____::b84450f9864182c67b8611b5593f4250', 
--"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" - 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council - 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? - 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University - 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade - 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki - 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho - 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen - 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens - -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot - 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University - 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark - 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin - 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt - 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven - 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape - 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute - 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University - 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg - 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) - 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr - 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw - 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly - 
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete - 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus - 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras - 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki - 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank - 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech - 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University - 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona - 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University - 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia - 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University - 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje - 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan - 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork +-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University ) )) foo; -ANALYZE TABLE TARGET.result COMPUTE STATISTICS; +COMPUTE STATS TARGET.result_new; -create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; +INSERT INTO TARGET.result select * from TARGET.result_new; -create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; +INSERT INTO TARGET.result_citations select * from TARGET.result_citations 
orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_citations; -create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; +INSERT INTO TARGET.result_references_oc select * from TARGET.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_references_oc; -create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; +INSERT INTO TARGET.result_citations_oc select * from TARGET.result_citations_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_citations_oc; -create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; +INSERT INTO TARGET.result_classifications select * from TARGET.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_classifications; -create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; +INSERT INTO TARGET.result_apc select * from TARGET.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_apc; -create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE 
TARGET.result_datasources COMPUTE STATISTICS; +INSERT INTO TARGET.result_concepts select * from TARGET.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_concepts; -create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; +INSERT INTO TARGET.result_datasources select * from TARGET.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_datasources; -create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; +INSERT INTO TARGET.result_fundercount select * from TARGET.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_fundercount; -create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; +INSERT INTO TARGET.result_gold select * from TARGET.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_gold; -create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; +INSERT INTO TARGET.result_greenoa select * from TARGET.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_greenoa; -create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from 
TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; +INSERT INTO TARGET.result_languages select * from TARGET.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_languages; -create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; -ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; +INSERT INTO TARGET.result_licenses select * from TARGET.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_licenses; -create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; +INSERT INTO TARGET.result_oids select * from TARGET.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_oids; -create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; +INSERT INTO TARGET.result_organization select * from TARGET.result_organization orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_organization; -create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; +INSERT INTO TARGET.result_peerreviewed select * from TARGET.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_peerreviewed; -create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 
1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; +INSERT INTO TARGET.result_pids select * from TARGET.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_pids; -create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; +INSERT INTO TARGET.result_projectcount select * from TARGET.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_projectcount; -create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; +INSERT INTO TARGET.result_projects select * from TARGET.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_projects; -create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; +INSERT INTO TARGET.result_refereed select * from TARGET.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_refereed; -create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; +INSERT INTO TARGET.result_sources select * from TARGET.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_sources; -create table TARGET.result_topics stored as parquet as select * from 
SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; +INSERT INTO TARGET.result_topics select * from TARGET.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_topics; -create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; +INSERT INTO TARGET.result_fos select * from TARGET.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.result_fos; -create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); -create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); -create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +create view TARGET.foo1 as select * from TARGET.result_result rr where rr.source in (select id from TARGET.result_new); +create view TARGET.foo2 as select * from TARGET.result_result rr where rr.target in (select id from TARGET.result_new); +INSERT INTO TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; - --- datasources -create view if not exists TARGET.datasource as select * from SOURCE.datasource; -create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; -create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations; -create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; - -create table 
TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; - --- organizations -create view if not exists TARGET.organization as select * from SOURCE.organization; -create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources; -create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids; -create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects; -create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources; - --- projects -create view if not exists TARGET.project as select * from SOURCE.project; -create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; -create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; -create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; -create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; - -create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; +COMPUTE STATS TARGET.result_result; -- indicators -- Sprint 1 ---- -create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; -create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; -create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref 
orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_green_oa select * from TARGET.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_green_oa; +INSERT INTO TARGET.indi_pub_grey_lit select * from TARGET.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_grey_lit; +INSERT INTO TARGET.indi_pub_doi_from_crossref select * from TARGET.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_doi_from_crossref; -- Sprint 2 ---- -create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; -create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; -create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_has_cc_licence select * from TARGET.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_result_has_cc_licence; +INSERT INTO 
TARGET.indi_result_has_cc_licence_url select * from TARGET.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_result_has_cc_licence_url; +INSERT INTO TARGET.indi_pub_has_abstract select * from TARGET.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_has_abstract; +INSERT INTO TARGET.indi_result_with_orcid select * from TARGET.indi_result_with_orcid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_result_with_orcid; ---- Sprint 3 ---- -create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; -create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; -create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; -create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; -create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; -create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; -create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; +INSERT INTO TARGET.indi_funded_result_with_fundref select * from TARGET.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_funded_result_with_fundref; ---- Sprint 4 ---- -create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; 
-create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; -create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_diamond select * from TARGET.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_diamond; +INSERT INTO TARGET.indi_pub_in_transformative select * from TARGET.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_in_transformative; +INSERT INTO TARGET.indi_pub_closed_other_open select * from TARGET.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_closed_other_open; ---- Sprint 5 ---- -create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_no_of_copies select * from TARGET.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_result_no_of_copies; ---- Sprint 6 ---- -create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads 
orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; -create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; -create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; -create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from TARGET.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_hybrid_oa_with_cc; +INSERT INTO TARGET.indi_pub_downloads select * from TARGET.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +COMPUTE STATS TARGET.indi_pub_downloads; +INSERT INTO TARGET.indi_pub_downloads_datasource select * from TARGET.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +COMPUTE STATS TARGET.indi_pub_downloads_datasource; +INSERT INTO TARGET.indi_pub_downloads_year select * from TARGET.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +COMPUTE STATS TARGET.indi_pub_downloads_year; +INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from TARGET.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where 
r.id=orig.result_id); +COMPUTE STATS TARGET.indi_pub_downloads_datasource_year; ---- Sprint 7 ---- -create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; -create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; -create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; -create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; -create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; -create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub; -create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year; -create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year; -create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; -create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; -create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; -create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; -create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; -create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_gold_oa select * from TARGET.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_gold_oa; +INSERT INTO TARGET.indi_pub_hybrid select * from TARGET.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_hybrid; + +INSERT INTO TARGET.indi_pub_has_preprint select * from TARGET.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_has_preprint; +INSERT INTO TARGET.indi_pub_in_subscribed select * from TARGET.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_pub_in_subscribed; +INSERT INTO TARGET.indi_result_with_pid select * from TARGET.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +COMPUTE STATS TARGET.indi_result_with_pid; --create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_datasets_gold_oa; --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_software_gold_oa; - +DROP TABLE TARGET.result_new; diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh deleted file mode 100644 index 03aa535e1..000000000 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/updateCache.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash 
- -curl --request GET $1/cache/updateCache -sleep 6h \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml index 2bcff70ac..dda645d8f 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -66,37 +66,9 @@ monitor.sh ${stats_db_name} ${monitor_db_name} - ${monitor_db_shadow_name} ${wf:appPath()}/scripts/createMonitorDB.sql monitor.sh - - - - - - - - ${jobTracker} - ${nameNode} - monitor-post.sh - ${stats_db_name} - ${monitor_db_name} - ${monitor_db_shadow_name} - monitor-post.sh - - - - - - - - ${jobTracker} - ${nameNode} - updateCache.sh - ${stats_tool_api_url} - updateCache.sh - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 3e69ff58d..885f7e4f7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -60,7 +60,8 @@ create table TARGET.result stored as parquet as 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan - 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College 
Cork + 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University ) )) foo; compute stats TARGET.result; From 51f7ab5864ca80023b1cc7d350e4dda38dd1cd99 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 17:15:06 +0200 Subject: [PATCH 06/10] Bug fixes --- .../oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index 2c46082fa..7e1333f92 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -2,11 +2,7 @@ DROP TABLE IF EXISTS TARGET.result_new; create table TARGET.result_new stored as parquet as select distinct * from ( - select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) - union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) - union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + select * from result r where exists (select 1 from result_organization ro where ro.id=r.id and ro.organization in ( -- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University ) )) foo; From dd70c32ad7ab3fccc38dd6c42c04b6ca2c6d15b1 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 12 Jan 2023 17:18:05 +0200 Subject: [PATCH 07/10] Bug fixes --- .../dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index 7e1333f92..2eb95294a 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -2,7 +2,7 @@ DROP TABLE IF EXISTS TARGET.result_new; create table TARGET.result_new stored as parquet as select distinct * from ( - select * from result r where exists (select 1 from result_organization ro where ro.id=r.id and ro.organization in ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( -- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University ) )) foo; From db7d625ba9b436060e8be56452215945d80efca6 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 25 Jan 2023 12:22:21 +0200 Subject: [PATCH 08/10] =?UTF-8?q?Addedd=20Arts=20et=20M=C3=A9tiers=20Paris?= =?UTF-8?q?Tech=20organization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh | 2 +- .../oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql | 6 ++++-- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh index 36cfcd325..10c1ed4ca 100644 --- 
a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -13,6 +13,6 @@ export SCRIPT_PATH=$3 echo "Getting file from " $3 hdfs dfs -copyToLocal $3 -echo "Creating monitor database" +echo "Updating monitor database" cat createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - echo "Impala shell finished" diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql index 2eb95294a..265610e90 100644 --- a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/scripts/createMonitorDB.sql @@ -1,15 +1,17 @@ DROP TABLE IF EXISTS TARGET.result_new; -create table TARGET.result_new stored as parquet as +create table TARGET.result_new as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( -- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University +-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech ) )) foo; COMPUTE STATS TARGET.result_new; INSERT INTO TARGET.result select * from TARGET.result_new; +COMPUTE STATS TARGET.result; INSERT INTO TARGET.result_citations select * from TARGET.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); COMPUTE STATS TARGET.result_citations; diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 885f7e4f7..c6b7d8ae2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -61,7 +61,8 @@ create table TARGET.result stored as parquet as 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech ) )) foo; compute stats TARGET.result; From 973d78a4d64718dbeb16ca4857f99419bdfade18 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 2 Feb 2023 08:03:54 +0200 Subject: [PATCH 09/10] Update step15_5.sql Added unpaywalls open access colors --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 584de0a56..753d61ca0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -39,7 +39,7 @@ create view 
${stats_db_name}.hrrst as select * from stats_ext.hrrst; create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( - select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, + select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; From 98c34263ed54655ca028753552a496392198795d Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 7 Feb 2023 08:14:48 +0200 Subject: [PATCH 10/10] Update step20-createMonitorDB.sql Add University of Cape Town organization --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index c6b7d8ae2..237f68fae 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -62,8 +62,9 @@ create table TARGET.result stored as parquet as 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University 
- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech - ) )) foo; + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town + ))) foo; compute stats TARGET.result; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);