forked from D-Net/dnet-hadoop
Bug fixes
This commit is contained in:
parent
595192d510
commit
032a401cbf
|
@ -14,7 +14,7 @@ function copydb() {
|
||||||
|
|
||||||
# copy the databases from ocean to impala
|
# copy the databases from ocean to impala
|
||||||
|
|
||||||
#echo "copying $db"
|
echo "copying $db"
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
|
||||||
|
|
||||||
# change ownership to impala
|
# change ownership to impala
|
||||||
|
|
|
@ -7,7 +7,9 @@ then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
export TARGET=$1
|
export TARGET=$1
|
||||||
export SCRIPT_PATH=$2
|
export STATS_EXT=$2
|
||||||
|
export SCRIPT_PATH=$3
|
||||||
|
|
||||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
|
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
|
||||||
export HADOOP_USER_NAME="oozie"
|
export HADOOP_USER_NAME="oozie"
|
||||||
|
|
||||||
|
@ -15,7 +17,7 @@ echo "Getting file from " $SCRIPT_PATH
|
||||||
hdfs dfs -copyToLocal $SCRIPT_PATH
|
hdfs dfs -copyToLocal $SCRIPT_PATH
|
||||||
|
|
||||||
echo "Creating indicators"
|
echo "Creating indicators"
|
||||||
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||||
hive $HIVE_OPTS -f foo
|
hive $HIVE_OPTS -f foo
|
||||||
hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql
|
hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql
|
||||||
echo "Indicators created"
|
echo "Indicators created"
|
||||||
|
|
|
@ -29,17 +29,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
|
||||||
from rcount
|
from rcount
|
||||||
group by rcount.pid;
|
group by rcount.pid;
|
||||||
|
|
||||||
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
|
create view ${stats_db_name}.rndexpenditure as select * from ${external_stats_db_name}.rndexpediture;
|
||||||
create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
|
create view ${stats_db_name}.rndgdpexpenditure as select * from ${external_stats_db_name}.rndgdpexpenditure;
|
||||||
create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
|
create view ${stats_db_name}.doctoratestudents as select * from ${external_stats_db_name}.doctoratestudents;
|
||||||
create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
|
create view ${stats_db_name}.totalresearchers as select * from ${external_stats_db_name}.totalresearchers;
|
||||||
create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
|
create view ${stats_db_name}.totalresearchersft as select * from ${external_stats_db_name}.totalresearchersft;
|
||||||
create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
|
create view ${stats_db_name}.hrrst as select * from ${external_stats_db_name}.hrrst;
|
||||||
|
|
||||||
create table ${stats_db_name}.result_instance stored as parquet as
|
create table ${stats_db_name}.result_instance stored as parquet as
|
||||||
select distinct r.*
|
select distinct r.*
|
||||||
from (
|
from (
|
||||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||||
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
||||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
||||||
join ${stats_db_name}.result res on res.id=r.id;
|
join ${stats_db_name}.result res on res.id=r.id;
|
||||||
|
@ -52,4 +52,4 @@ from (
|
||||||
join ${stats_db_name}.result res on res.id=r.id
|
join ${stats_db_name}.result res on res.id=r.id
|
||||||
where r.amount is not null;
|
where r.amount is not null;
|
||||||
|
|
||||||
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;
|
create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
|
|
@ -180,7 +180,7 @@ from publication_datasources pd
|
||||||
left outer join (
|
left outer join (
|
||||||
select pd.id, 1 as in_diamond_journal from publication_datasources pd
|
select pd.id, 1 as in_diamond_journal from publication_datasources pd
|
||||||
join datasource d on d.id=pd.datasource
|
join datasource d on d.id=pd.datasource
|
||||||
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||||
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
|
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
|
||||||
on pd.id=tmp.id;
|
on pd.id=tmp.id;
|
||||||
|
|
||||||
|
@ -192,7 +192,7 @@ from publication pd
|
||||||
left outer join (
|
left outer join (
|
||||||
select pd.id, 1 as is_transformative from publication_datasources pd
|
select pd.id, 1 as is_transformative from publication_datasources pd
|
||||||
join datasource d on d.id=pd.datasource
|
join datasource d on d.id=pd.datasource
|
||||||
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||||
and ps.is_transformative_journal=true) tmp
|
and ps.is_transformative_journal=true) tmp
|
||||||
on pd.id=tmp.id;
|
on pd.id=tmp.id;
|
||||||
|
|
||||||
|
@ -220,11 +220,11 @@ ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
|
||||||
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
|
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||||
WITH hybrid_oa AS (
|
WITH hybrid_oa AS (
|
||||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||||
FROM stats_ext.plan_s_jn
|
FROM STATS_EXT.plan_s_jn
|
||||||
WHERE issn_print != ""
|
WHERE issn_print != ""
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
||||||
FROM stats_ext.plan_s_jn
|
FROM STATS_EXT.plan_s_jn
|
||||||
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
||||||
issn AS (
|
issn AS (
|
||||||
SELECT *
|
SELECT *
|
||||||
|
@ -291,7 +291,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
|
||||||
journal_is_oa,
|
journal_is_oa,
|
||||||
issn_1 as issn
|
issn_1 as issn
|
||||||
FROM
|
FROM
|
||||||
stats_ext.oa_journals
|
STATS_EXT.oa_journals
|
||||||
WHERE
|
WHERE
|
||||||
issn_1 != ""
|
issn_1 != ""
|
||||||
UNION
|
UNION
|
||||||
|
@ -301,7 +301,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as
|
||||||
journal_is_oa,
|
journal_is_oa,
|
||||||
issn_2 as issn
|
issn_2 as issn
|
||||||
FROM
|
FROM
|
||||||
stats_ext.oa_journals
|
STATS_EXT.oa_journals
|
||||||
WHERE
|
WHERE
|
||||||
issn_2 != "" ), issn AS ( SELECT
|
issn_2 != "" ), issn AS ( SELECT
|
||||||
*
|
*
|
||||||
|
@ -343,7 +343,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
|
||||||
issn_1 as issn,
|
issn_1 as issn,
|
||||||
has_apc
|
has_apc
|
||||||
FROM
|
FROM
|
||||||
stats_ext.oa_journals
|
STATS_EXT.oa_journals
|
||||||
WHERE
|
WHERE
|
||||||
issn_1 != ""
|
issn_1 != ""
|
||||||
UNION
|
UNION
|
||||||
|
@ -354,7 +354,7 @@ create table if not exists indi_pub_hybrid stored as parquet as
|
||||||
issn_2 as issn,
|
issn_2 as issn,
|
||||||
has_apc
|
has_apc
|
||||||
FROM
|
FROM
|
||||||
stats_ext.oa_journals
|
STATS_EXT.oa_journals
|
||||||
WHERE
|
WHERE
|
||||||
issn_2 != "" ), issn AS ( SELECT
|
issn_2 != "" ), issn AS ( SELECT
|
||||||
*
|
*
|
||||||
|
|
|
@ -108,6 +108,7 @@
|
||||||
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
|
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
|
||||||
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
|
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
|
||||||
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
|
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
|
||||||
|
<case to="Step24-updateCache">${wf:conf('resumeFrom') eq 'Step24-updateCache'}</case>
|
||||||
<default to="Step1"/>
|
<default to="Step1"/>
|
||||||
</switch>
|
</switch>
|
||||||
</decision>
|
</decision>
|
||||||
|
@ -289,6 +290,7 @@
|
||||||
<script>scripts/step15_5.sql</script>
|
<script>scripts/step15_5.sql</script>
|
||||||
<param>stats_db_name=${stats_db_name}</param>
|
<param>stats_db_name=${stats_db_name}</param>
|
||||||
<param>openaire_db_name=${openaire_db_name}</param>
|
<param>openaire_db_name=${openaire_db_name}</param>
|
||||||
|
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||||
</hive2>
|
</hive2>
|
||||||
<ok to="Contexts"/>
|
<ok to="Contexts"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -313,6 +315,7 @@
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<exec>indicators.sh</exec>
|
<exec>indicators.sh</exec>
|
||||||
<argument>${stats_db_name}</argument>
|
<argument>${stats_db_name}</argument>
|
||||||
|
<argument>${external_stats_db_name}</argument>
|
||||||
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
|
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
|
||||||
<file>indicators.sh</file>
|
<file>indicators.sh</file>
|
||||||
</shell>
|
</shell>
|
||||||
|
@ -452,21 +455,21 @@
|
||||||
<argument>${observatory_db_shadow_name}</argument>
|
<argument>${observatory_db_shadow_name}</argument>
|
||||||
<file>finalizeImpalaCluster.sh</file>
|
<file>finalizeImpalaCluster.sh</file>
|
||||||
</shell>
|
</shell>
|
||||||
|
<ok to="Step24-updateCache"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="Step24-updateCache">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>updateCache.sh</exec>
|
||||||
|
<argument>${stats_tool_api_url}</argument>
|
||||||
|
<file>updateCache.sh</file>
|
||||||
|
</shell>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<!-- <action name="Step24-updateCache">-->
|
|
||||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
|
||||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
|
||||||
<!-- <name-node>${nameNode}</name-node>-->
|
|
||||||
<!-- <exec>updateCache.sh</exec>-->
|
|
||||||
<!-- <argument>${stats_tool_api_url}</argument>-->
|
|
||||||
<!-- <file>updateCache.sh</file>-->
|
|
||||||
<!-- </shell>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
||||||
|
|
Loading…
Reference in New Issue