From 032a401cbf930304affe67eb533bb4402ba8eebe Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 20 Feb 2023 09:29:20 +0200 Subject: [PATCH] Bug fixes --- .../oozie_app/copyDataToImpalaCluster.sh | 2 +- .../oa/graph/stats/oozie_app/indicators.sh | 6 +++-- .../stats/oozie_app/scripts/step15_5.sql | 16 +++++------ .../scripts/step16-createIndicatorsTables.sql | 16 +++++------ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 27 ++++++++++--------- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 5b6752398..843877c90 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -14,7 +14,7 @@ function copydb() { # copy the databases from ocean to impala - #echo "copying $db" + echo "copying $db" hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp # change ownership to impala diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 473864315..2f1eefa0c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -7,7 +7,9 @@ then fi export TARGET=$1 -export SCRIPT_PATH=$2 +export STATS_EXT=$2 +export SCRIPT_PATH=$3 + export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false" export HADOOP_USER_NAME="oozie" @@ -15,7 +17,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive $HIVE_OPTS -f foo hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 1ae856355..61c0726ff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -29,17 +29,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; -create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; -create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; -create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; -create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; +create view ${stats_db_name}.rndexpenditure as select * from ${external_stats_db_name}.rndexpediture; +create view ${stats_db_name}.rndgdpexpenditure as select * from ${external_stats_db_name}.rndgdpexpenditure; +create view ${stats_db_name}.doctoratestudents as select * from ${external_stats_db_name}.doctoratestudents; +create view ${stats_db_name}.totalresearchers as select * from ${external_stats_db_name}.totalresearchers; +create view ${stats_db_name}.totalresearchersft as select * from ${external_stats_db_name}.totalresearchersft; +create view ${stats_db_name}.hrrst as select * from ${external_stats_db_name}.hrrst; create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( - select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, + select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; @@ -52,4 +52,4 @@ from ( join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index ac4d4202a..4fd941e5d 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -180,7 +180,7 @@ from publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; @@ -192,7 +192,7 @@ from publication pd left outer join ( select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; @@ -220,11 +220,11 @@ ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_print != "" UNION ALL SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), issn AS ( SELECT * @@ -291,7 +291,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as journal_is_oa, issn_1 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -301,7 +301,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as journal_is_oa, issn_2 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -343,7 +343,7 @@ create table if not exists indi_pub_hybrid stored as parquet as issn_1 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -354,7 +354,7 @@ create table if not exists indi_pub_hybrid stored as parquet as issn_2 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index e23bd0aa3..e9453d7b1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -108,6 +108,7 @@ ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step24-updateCache'} @@ -289,6 +290,7 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} @@ -313,6 +315,7 @@ ${nameNode} indicators.sh ${stats_db_name} + ${external_stats_db_name} ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh @@ -452,21 +455,21 @@ ${observatory_db_shadow_name} finalizeImpalaCluster.sh + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + - - - - - - - - - - - -