Changes 06022023

This commit is contained in:
dimitrispie 2023-02-06 13:18:53 +02:00
parent cf58e4a5e4
commit 2dc6d47270
9 changed files with 127 additions and 75 deletions

View File

@ -6,13 +6,16 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
#export HADOOP_USER_NAME="dimitris.pierrakos"
export HADOOP_USER_NAME=$4
function copydb() {
db=$1
# copy the databases from ocean to impala
#echo "copying $db"
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
# change ownership to impala
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db
@ -48,9 +51,10 @@ function copydb() {
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
HADOOP_USER_NAME=$4
#EXT_DB=$4
copydb $EXT_DB
#copydb $EXT_DB
copydb $STATS_DB
copydb $MONITOR_DB
copydb $OBSERVATORY_DB

View File

@ -10,9 +10,10 @@ function createShadowDB() {
SOURCE=$1
SHADOW=$2
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
}
STATS_DB=$1

View File

@ -12,5 +12,8 @@ export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark
export HADOOP_USER_NAME="oozie"
echo "Updating shadow database"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
hive $HIVE_OPTS -f foo
hive -e "drop database if exists ${SHADOW} cascade"
hive -e "create database if not exists ${SHADOW}"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo
echo "Updated shadow database"

View File

@ -8,7 +8,7 @@ fi
export TARGET=$1
export SCRIPT_PATH=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH

View File

@ -17,9 +17,8 @@ export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating monitor database"
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"
echo "Hive shell finished"

View File

@ -10,6 +10,9 @@ export SOURCE=$1
export TARGET=$2
export SHADOW=$3
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive -f foo
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -10,8 +10,11 @@ export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Creating observatory database"
hive -e "drop database if exists ${TARGET} cascade"
hive -e "create database if not exists ${TARGET}"
hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo

View File

@ -60,9 +60,9 @@ create table TARGET.result stored as parquet as
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech
) )) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -1,4 +1,4 @@
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Graph Stats Hive" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>
@ -53,6 +53,10 @@
<name>context_api_url</name>
<description>the base url of the context api (https://services.openaire.eu/openaire)</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
@ -74,7 +78,40 @@
</configuration>
</global>
<start to="Step1"/>
<start to="resume_from"/>
<!-- Resume router. Each case compares the 'resumeFrom' workflow configuration
     property (read through wf:conf) with a node name and, on a match,
     transitions to the node of that name. When no case matches, including
     when the property does not equal any listed name, the default transition
     goes to Step1.
     NOTE(review): every 'to' target listed here is expected to name a node
     defined in this workflow. Confirm that targets whose actions have been
     commented out elsewhere in the file (Step19-finalize in particular) are
     still defined, otherwise the case should be removed or re-pointed. -->
<decision name="resume_from">
<switch>
<case to="Step1">${wf:conf('resumeFrom') eq 'Step1'}</case>
<case to="Step2">${wf:conf('resumeFrom') eq 'Step2'}</case>
<case to="Step3">${wf:conf('resumeFrom') eq 'Step3'}</case>
<case to="Step4">${wf:conf('resumeFrom') eq 'Step4'}</case>
<case to="Step5">${wf:conf('resumeFrom') eq 'Step5'}</case>
<case to="Step6">${wf:conf('resumeFrom') eq 'Step6'}</case>
<case to="Step7">${wf:conf('resumeFrom') eq 'Step7'}</case>
<case to="Step8">${wf:conf('resumeFrom') eq 'Step8'}</case>
<case to="Step9">${wf:conf('resumeFrom') eq 'Step9'}</case>
<case to="Step10">${wf:conf('resumeFrom') eq 'Step10'}</case>
<case to="Step11">${wf:conf('resumeFrom') eq 'Step11'}</case>
<case to="Step12">${wf:conf('resumeFrom') eq 'Step12'}</case>
<case to="Step13">${wf:conf('resumeFrom') eq 'Step13'}</case>
<case to="Step14">${wf:conf('resumeFrom') eq 'Step14'}</case>
<case to="Step15">${wf:conf('resumeFrom') eq 'Step15'}</case>
<case to="Step15_5">${wf:conf('resumeFrom') eq 'Step15_5'}</case>
<case to="Contexts">${wf:conf('resumeFrom') eq 'Contexts'}</case>
<case to="Step16-createIndicatorsTables">${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'}</case>
<case to="Step16_1-definitions">${wf:conf('resumeFrom') eq 'Step16_1-definitions'}</case>
<case to="Step16_5">${wf:conf('resumeFrom') eq 'Step16_5'}</case>
<case to="Step19-finalize">${wf:conf('resumeFrom') eq 'Step19-finalize'}</case>
<case to="step20-createMonitorDB">${wf:conf('resumeFrom') eq 'step20-createMonitorDB'}</case>
<case to="step21-createObservatoryDB-pre">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'}</case>
<case to="step21-createObservatoryDB">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'}</case>
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
<default to="Step1"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -302,22 +339,22 @@
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step19-finalize"/>
<error to="Kill"/>
</action>
<action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizedb.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<file>finalizedb.sh</file>
</shell>
<ok to="step20-createMonitorDB"/>
<error to="Kill"/>
</action>
<!-- <action name="Step19-finalize">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizedb.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <file>finalizedb.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step20-createMonitorDB"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
@ -355,55 +392,57 @@
<param>stats_db_name=${stats_db_name}</param>
<param>observatory_db_name=${observatory_db_name}</param>
</hive2>
<ok to="End"/>
<ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/>
</action>
<!-- <action name="step21-createObservatoryDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>observatory-post.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>observatory-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step22-copyDataToImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- Shell action that runs observatory-post.sh. The script receives three
     positional arguments, in this order: the stats database name, the
     observatory database name and the observatory shadow database name.
     On success the workflow continues with step22-copyDataToImpalaCluster;
     on failure it transitions to the Kill node. -->
<action name="step21-createObservatoryDB-post">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>observatory-post.sh</file>
</shell>
<ok to="step22-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<!-- <action name="step22-copyDataToImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>copyDataToImpalaCluster.sh</exec>-->
<!-- Shell action that runs copyDataToImpalaCluster.sh. The active (not
     commented-out) arguments are passed positionally in this order: stats
     database name, monitor database name, observatory database name and the
     hadoop_user_name workflow parameter. On success the workflow continues
     with step23-finalizeImpalaCluster; on failure it transitions to Kill. -->
<action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <file>copyDataToImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step23-finalizeImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="step23-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<!-- <action name="step23-finalizeImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizeImpalaCluster.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>finalizeImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- Shell action that runs finalizeImpalaCluster.sh. The script receives six
     positional arguments as three (database, shadow database) name pairs, in
     this order: stats, monitor, observatory. On success the workflow
     transitions to End; on failure it transitions to the Kill node. -->
<action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<!-- <action name="Step24-updateCache">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->