Stats workflow executed on Hive only #283

Merged
claudio.atzori merged 45 commits from antonis.lempesis/dnet-hadoop:beta into beta 2023-05-02 14:05:13 +02:00
5 changed files with 36 additions and 31 deletions
Showing only changes of commit 032a401cbf - Show all commits

View File

@ -14,7 +14,7 @@ function copydb() {
# copy the databases from ocean to impala
#echo "copying $db"
echo "copying $db"
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
# change ownership to impala

View File

@ -7,7 +7,9 @@ then
fi
export TARGET=$1
export SCRIPT_PATH=$2
export STATS_EXT=$2
export SCRIPT_PATH=$3
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
export HADOOP_USER_NAME="oozie"
@ -15,7 +17,7 @@ echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating indicators"
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive $HIVE_OPTS -f foo
hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql
echo "Indicators created"

View File

@ -29,17 +29,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
from rcount
group by rcount.pid;
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
create view ${stats_db_name}.rndexpenditure as select * from ${external_stats_db_name}.rndexpediture;
create view ${stats_db_name}.rndgdpexpenditure as select * from ${external_stats_db_name}.rndgdpexpenditure;
create view ${stats_db_name}.doctoratestudents as select * from ${external_stats_db_name}.doctoratestudents;
create view ${stats_db_name}.totalresearchers as select * from ${external_stats_db_name}.totalresearchers;
create view ${stats_db_name}.totalresearchersft as select * from ${external_stats_db_name}.totalresearchersft;
create view ${stats_db_name}.hrrst as select * from ${external_stats_db_name}.hrrst;
create table ${stats_db_name}.result_instance stored as parquet as
select distinct r.*
from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom,
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id;
@ -52,4 +52,4 @@ from (
join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null;
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;
create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;

View File

@ -180,7 +180,7 @@ from publication_datasources pd
left outer join (
select pd.id, 1 as in_diamond_journal from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id;
@ -192,7 +192,7 @@ from publication pd
left outer join (
select pd.id, 1 as is_transformative from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id;
@ -220,11 +220,11 @@ ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM stats_ext.plan_s_jn
FROM STATS_EXT.plan_s_jn
WHERE issn_print != ""
UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
FROM stats_ext.plan_s_jn
FROM STATS_EXT.plan_s_jn
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
issn AS (
SELECT *
@ -291,7 +291,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
journal_is_oa,
issn_1 as issn
FROM
stats_ext.oa_journals
STATS_EXT.oa_journals
WHERE
issn_1 != ""
UNION
@ -301,7 +301,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
journal_is_oa,
issn_2 as issn
FROM
stats_ext.oa_journals
STATS_EXT.oa_journals
WHERE
issn_2 != "" ), issn AS ( SELECT
*
@ -343,7 +343,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
issn_1 as issn,
has_apc
FROM
stats_ext.oa_journals
STATS_EXT.oa_journals
WHERE
issn_1 != ""
UNION
@ -354,7 +354,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
issn_2 as issn,
has_apc
FROM
stats_ext.oa_journals
STATS_EXT.oa_journals
WHERE
issn_2 != "" ), issn AS ( SELECT
*

View File

@ -108,6 +108,7 @@
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
<case to="Step24-updateCache">${wf:conf('resumeFrom') eq 'Step24-updateCache'}</case>
<default to="Step1"/>
</switch>
</decision>
@ -289,6 +290,7 @@
<script>scripts/step15_5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<ok to="Contexts"/>
<error to="Kill"/>
@ -313,6 +315,7 @@
<name-node>${nameNode}</name-node>
<exec>indicators.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${external_stats_db_name}</argument>
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
<file>indicators.sh</file>
</shell>
@ -452,21 +455,21 @@
<argument>${observatory_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="Step24-updateCache"/>
<error to="Kill"/>
</action>
<action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateCache.sh</exec>
<argument>${stats_tool_api_url}</argument>
<file>updateCache.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<!-- <action name="Step24-updateCache">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>updateCache.sh</exec>-->
<!-- <argument>${stats_tool_api_url}</argument>-->
<!-- <file>updateCache.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<end name="End"/>
</workflow-app>