forked from D-Net/dnet-hadoop
Changes 06022023
This commit is contained in:
parent
cf58e4a5e4
commit
2dc6d47270
|
@ -6,13 +6,16 @@ then
|
|||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
#export HADOOP_USER_NAME="dimitris.pierrakos"
|
||||
export HADOOP_USER_NAME=$4
|
||||
|
||||
function copydb() {
|
||||
db=$1
|
||||
|
||||
# copy the databases from ocean to impala
|
||||
|
||||
#echo "copying $db"
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
|
||||
|
||||
# change ownership to impala
|
||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db
|
||||
|
@ -48,9 +51,10 @@ function copydb() {
|
|||
STATS_DB=$1
|
||||
MONITOR_DB=$2
|
||||
OBSERVATORY_DB=$3
|
||||
EXT_DB=$4
|
||||
HADOOP_USER_NAME=$4
|
||||
#EXT_DB=$4
|
||||
|
||||
copydb $EXT_DB
|
||||
#copydb $EXT_DB
|
||||
copydb $STATS_DB
|
||||
copydb $MONITOR_DB
|
||||
copydb $OBSERVATORY_DB
|
||||
|
|
|
@ -10,9 +10,10 @@ function createShadowDB() {
|
|||
SOURCE=$1
|
||||
SHADOW=$2
|
||||
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
|
||||
# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
|
||||
}
|
||||
|
||||
STATS_DB=$1
|
||||
|
|
|
@ -12,5 +12,8 @@ export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark
|
|||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
echo "Updating shadow database"
|
||||
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
hive -e "drop database if exists ${SHADOW} cascade"
|
||||
hive -e "create database if not exists ${SHADOW}"
|
||||
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Updated shadow database"
|
|
@ -8,7 +8,7 @@ fi
|
|||
|
||||
export TARGET=$1
|
||||
export SCRIPT_PATH=$2
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
|
||||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
echo "Getting file from " $SCRIPT_PATH
|
||||
|
|
|
@ -17,9 +17,8 @@ export HADOOP_USER_NAME="oozie"
|
|||
echo "Getting file from " $SCRIPT_PATH
|
||||
hdfs dfs -copyToLocal $SCRIPT_PATH
|
||||
|
||||
|
||||
echo "Creating monitor database"
|
||||
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
|
||||
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo
|
||||
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
echo "Hive shell finished"
|
||||
echo "Hive shell finished"
|
|
@ -10,6 +10,9 @@ export SOURCE=$1
|
|||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
|
||||
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||
hive -f foo
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
|
||||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
echo "Hive shell finished"
|
||||
|
|
|
@ -10,8 +10,11 @@ export SOURCE=$1
|
|||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
|
||||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
echo "Creating observatory database"
|
||||
hive -e "drop database if exists ${TARGET} cascade"
|
||||
hive -e "create database if not exists ${TARGET}"
|
||||
hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
|
||||
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
|
||||
hive -f foo
|
||||
|
|
|
@ -60,9 +60,9 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork
|
||||
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
|
||||
'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech
|
||||
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
|
||||
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech
|
||||
) )) foo;
|
||||
|
||||
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Graph Stats Hive" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
|
@ -53,6 +53,10 @@
|
|||
<name>context_api_url</name>
|
||||
<description>the base url of the context api (https://services.openaire.eu/openaire)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hadoop_user_name</name>
|
||||
<description>user name of the wf owner</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -74,7 +78,40 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="Step1"/>
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="Step1">${wf:conf('resumeFrom') eq 'Step1'}</case>
|
||||
<case to="Step2">${wf:conf('resumeFrom') eq 'Step2'}</case>
|
||||
<case to="Step3">${wf:conf('resumeFrom') eq 'Step3'}</case>
|
||||
<case to="Step4">${wf:conf('resumeFrom') eq 'Step4'}</case>
|
||||
<case to="Step5">${wf:conf('resumeFrom') eq 'Step5'}</case>
|
||||
<case to="Step6">${wf:conf('resumeFrom') eq 'Step6'}</case>
|
||||
<case to="Step7">${wf:conf('resumeFrom') eq 'Step7'}</case>
|
||||
<case to="Step8">${wf:conf('resumeFrom') eq 'Step8'}</case>
|
||||
<case to="Step9">${wf:conf('resumeFrom') eq 'Step9'}</case>
|
||||
<case to="Step10">${wf:conf('resumeFrom') eq 'Step10'}</case>
|
||||
<case to="Step11">${wf:conf('resumeFrom') eq 'Step11'}</case>
|
||||
<case to="Step12">${wf:conf('resumeFrom') eq 'Step12'}</case>
|
||||
<case to="Step13">${wf:conf('resumeFrom') eq 'Step13'}</case>
|
||||
<case to="Step14">${wf:conf('resumeFrom') eq 'Step14'}</case>
|
||||
<case to="Step15">${wf:conf('resumeFrom') eq 'Step15'}</case>
|
||||
<case to="Step15_5">${wf:conf('resumeFrom') eq 'Step15_5'}</case>
|
||||
<case to="Contexts">${wf:conf('resumeFrom') eq 'Contexts'}</case>
|
||||
<case to="Step16-createIndicatorsTables">${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'}</case>
|
||||
<case to="Step16_1-definitions">${wf:conf('resumeFrom') eq 'Step16_1-definitions'}</case>
|
||||
<case to="Step16_5">${wf:conf('resumeFrom') eq 'Step16_5'}</case>
|
||||
<case to="Step19-finalize">${wf:conf('resumeFrom') eq 'Step19-finalize'}</case>
|
||||
<case to="step20-createMonitorDB">${wf:conf('resumeFrom') eq 'step20-createMonitorDB'}</case>
|
||||
<case to="step21-createObservatoryDB-pre">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'}</case>
|
||||
<case to="step21-createObservatoryDB">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'}</case>
|
||||
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
|
||||
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
|
||||
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
|
||||
<default to="Step1"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -302,22 +339,22 @@
|
|||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step19-finalize"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step19-finalize">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizedb.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${stats_db_shadow_name}</argument>
|
||||
<file>finalizedb.sh</file>
|
||||
</shell>
|
||||
<ok to="step20-createMonitorDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step19-finalize">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>finalizedb.sh</exec>-->
|
||||
<!-- <argument>${stats_db_name}</argument>-->
|
||||
<!-- <argument>${stats_db_shadow_name}</argument>-->
|
||||
<!-- <file>finalizedb.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step20-createMonitorDB"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<action name="step20-createMonitorDB">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
|
@ -355,55 +392,57 @@
|
|||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>observatory_db_name=${observatory_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="End"/>
|
||||
<ok to="step21-createObservatoryDB-post"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="step21-createObservatoryDB-post">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>observatory-post.sh</exec>-->
|
||||
<!-- <argument>${stats_db_name}</argument>-->
|
||||
<!-- <argument>${observatory_db_name}</argument>-->
|
||||
<!-- <argument>${observatory_db_shadow_name}</argument>-->
|
||||
<!-- <file>observatory-post.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step22-copyDataToImpalaCluster"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
<action name="step21-createObservatoryDB-post">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>observatory-post.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>observatory-post.sh</file>
|
||||
</shell>
|
||||
<ok to="step22-copyDataToImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="step22-copyDataToImpalaCluster">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>copyDataToImpalaCluster.sh</exec>-->
|
||||
<action name="step22-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
|
||||
<!-- <argument>${external_stats_db_name}</argument>-->
|
||||
<!-- <argument>${stats_db_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_name}</argument>-->
|
||||
<!-- <argument>${observatory_db_name}</argument>-->
|
||||
<!-- <file>copyDataToImpalaCluster.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step23-finalizeImpalaCluster"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${hadoop_user_name}</argument>
|
||||
<file>copyDataToImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="step23-finalizeImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="step23-finalizeImpalaCluster">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>finalizeImpalaCluster.sh</exec>-->
|
||||
<!-- <argument>${stats_db_name}</argument>-->
|
||||
<!-- <argument>${stats_db_shadow_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_shadow_name}</argument>-->
|
||||
<!-- <argument>${observatory_db_name}</argument>-->
|
||||
<!-- <argument>${observatory_db_shadow_name}</argument>-->
|
||||
<!-- <file>finalizeImpalaCluster.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="End"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
<action name="step23-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${stats_db_shadow_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step24-updateCache">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
|
|
Loading…
Reference in New Issue