Stats wf executed on hive only #283
|
@ -14,7 +14,7 @@ function copydb() {
|
|||
|
||||
# copy the databases from ocean to impala
|
||||
|
||||
#echo "copying $db"
|
||||
echo "copying $db"
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
|
||||
|
||||
# change ownership to impala
|
||||
|
|
|
@ -7,7 +7,9 @@ then
|
|||
fi
|
||||
|
||||
export TARGET=$1
|
||||
export SCRIPT_PATH=$2
|
||||
export STATS_EXT=$2
|
||||
export SCRIPT_PATH=$3
|
||||
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
|
||||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
|
@ -15,7 +17,7 @@ echo "Getting file from " $SCRIPT_PATH
|
|||
hdfs dfs -copyToLocal $SCRIPT_PATH
|
||||
|
||||
echo "Creating indicators"
|
||||
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql
|
||||
echo "Indicators created"
|
||||
|
|
|
@ -29,17 +29,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
|
|||
from rcount
|
||||
group by rcount.pid;
|
||||
|
||||
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
|
||||
create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
|
||||
create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
|
||||
create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
|
||||
create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
|
||||
create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
|
||||
create view ${stats_db_name}.rndexpenditure as select * from ${external_stats_db_name}.rndexpediture;
|
||||
create view ${stats_db_name}.rndgdpexpenditure as select * from ${external_stats_db_name}.rndgdpexpenditure;
|
||||
create view ${stats_db_name}.doctoratestudents as select * from ${external_stats_db_name}.doctoratestudents;
|
||||
create view ${stats_db_name}.totalresearchers as select * from ${external_stats_db_name}.totalresearchers;
|
||||
create view ${stats_db_name}.totalresearchersft as select * from ${external_stats_db_name}.totalresearchersft;
|
||||
create view ${stats_db_name}.hrrst as select * from ${external_stats_db_name}.hrrst;
|
||||
|
||||
create table ${stats_db_name}.result_instance stored as parquet as
|
||||
select distinct r.*
|
||||
from (
|
||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
||||
join ${stats_db_name}.result res on res.id=r.id;
|
||||
|
@ -52,4 +52,4 @@ from (
|
|||
join ${stats_db_name}.result res on res.id=r.id
|
||||
where r.amount is not null;
|
||||
|
||||
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;
|
||||
create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
|
|
@ -180,7 +180,7 @@ from publication_datasources pd
|
|||
left outer join (
|
||||
select pd.id, 1 as in_diamond_journal from publication_datasources pd
|
||||
join datasource d on d.id=pd.datasource
|
||||
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
|
@ -192,7 +192,7 @@ from publication pd
|
|||
left outer join (
|
||||
select pd.id, 1 as is_transformative from publication_datasources pd
|
||||
join datasource d on d.id=pd.datasource
|
||||
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
and ps.is_transformative_journal=true) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
|
@ -220,11 +220,11 @@ ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
|
|||
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||
WITH hybrid_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||
FROM stats_ext.plan_s_jn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_print != ""
|
||||
UNION ALL
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
||||
FROM stats_ext.plan_s_jn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
||||
issn AS (
|
||||
SELECT *
|
||||
|
@ -291,7 +291,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
|
|||
journal_is_oa,
|
||||
issn_1 as issn
|
||||
FROM
|
||||
stats_ext.oa_journals
|
||||
STATS_EXT.oa_journals
|
||||
WHERE
|
||||
issn_1 != ""
|
||||
UNION
|
||||
|
@ -301,7 +301,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
|
|||
journal_is_oa,
|
||||
issn_2 as issn
|
||||
FROM
|
||||
stats_ext.oa_journals
|
||||
STATS_EXT.oa_journals
|
||||
WHERE
|
||||
issn_2 != "" ), issn AS ( SELECT
|
||||
*
|
||||
|
@ -343,7 +343,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
|
|||
issn_1 as issn,
|
||||
has_apc
|
||||
FROM
|
||||
stats_ext.oa_journals
|
||||
STATS_EXT.oa_journals
|
||||
WHERE
|
||||
issn_1 != ""
|
||||
UNION
|
||||
|
@ -354,7 +354,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
|
|||
issn_2 as issn,
|
||||
has_apc
|
||||
FROM
|
||||
stats_ext.oa_journals
|
||||
STATS_EXT.oa_journals
|
||||
WHERE
|
||||
issn_2 != "" ), issn AS ( SELECT
|
||||
*
|
||||
|
|
|
@ -108,6 +108,7 @@
|
|||
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
|
||||
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
|
||||
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
|
||||
<case to="Step24-updateCache">${wf:conf('resumeFrom') eq 'Step24-updateCache'}</case>
|
||||
<default to="Step1"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -289,6 +290,7 @@
|
|||
<script>scripts/step15_5.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Contexts"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -313,6 +315,7 @@
|
|||
<name-node>${nameNode}</name-node>
|
||||
<exec>indicators.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${external_stats_db_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
|
||||
<file>indicators.sh</file>
|
||||
</shell>
|
||||
|
@ -452,21 +455,21 @@
|
|||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="Step24-updateCache"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step24-updateCache">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>updateCache.sh</exec>
|
||||
<argument>${stats_tool_api_url}</argument>
|
||||
<file>updateCache.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step24-updateCache">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>updateCache.sh</exec>-->
|
||||
<!-- <argument>${stats_tool_api_url}</argument>-->
|
||||
<!-- <file>updateCache.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="End"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
||||
|
|
Loading…
Reference in New Issue