Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
9 changed files with 127 additions and 75 deletions
Showing only changes of commit 2dc6d47270 - Show all commits

View File

@@ -6,13 +6,16 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi fi
#export HADOOP_USER_NAME="dimitris.pierrakos"
export HADOOP_USER_NAME=$4
function copydb() { function copydb() {
db=$1 db=$1
# copy the databases from ocean to impala # copy the databases from ocean to impala
#echo "copying $db" #echo "copying $db"
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn2.openaire.eu:8020/tmp hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp
# change ownership to impala # change ownership to impala
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/${db}.db
@@ -48,9 +51,10 @@ function copydb() {
STATS_DB=$1 STATS_DB=$1
MONITOR_DB=$2 MONITOR_DB=$2
OBSERVATORY_DB=$3 OBSERVATORY_DB=$3
EXT_DB=$4 HADOOP_USER_NAME=$4
#EXT_DB=$4
copydb $EXT_DB #copydb $EXT_DB
copydb $STATS_DB copydb $STATS_DB
copydb $MONITOR_DB copydb $MONITOR_DB
copydb $OBSERVATORY_DB copydb $OBSERVATORY_DB

View File

@@ -10,9 +10,10 @@ function createShadowDB() {
SOURCE=$1 SOURCE=$1
SHADOW=$2 SHADOW=$2
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - # impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
} }
STATS_DB=$1 STATS_DB=$1

View File

@@ -12,5 +12,8 @@ export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark
export HADOOP_USER_NAME="oozie" export HADOOP_USER_NAME="oozie"
echo "Updating shadow database" echo "Updating shadow database"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo hive -e "drop database if exists ${SHADOW} cascade"
hive $HIVE_OPTS -f foo hive -e "create database if not exists ${SHADOW}"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo
echo "Updated shadow database"

View File

@@ -8,7 +8,7 @@ fi
export TARGET=$1 export TARGET=$1
export SCRIPT_PATH=$2 export SCRIPT_PATH=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
export HADOOP_USER_NAME="oozie" export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH echo "Getting file from " $SCRIPT_PATH

View File

@@ -17,9 +17,8 @@ export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating monitor database" echo "Creating monitor database"
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo #cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo
hive $HIVE_OPTS -f foo hive $HIVE_OPTS -f foo
echo "Hive shell finished" echo "Hive shell finished"

View File

@@ -10,6 +10,9 @@ export SOURCE=$1
export TARGET=$2 export TARGET=$2
export SHADOW=$3 export SHADOW=$3
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
hive -f foo export HADOOP_USER_NAME="oozie"
hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished" echo "Hive shell finished"

View File

@@ -10,8 +10,11 @@ export SOURCE=$1
export TARGET=$2 export TARGET=$2
export SHADOW=$3 export SHADOW=$3
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Creating observatory database" echo "Creating observatory database"
hive -e "drop database if exists ${TARGET} cascade" hive -e "drop database if exists ${TARGET} cascade"
hive -e "create database if not exists ${TARGET}" hive -e "create database if not exists ${TARGET}"
hive --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo hive -f foo

View File

@@ -60,9 +60,9 @@ create table TARGET.result stored as parquet as
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech -- 'openorgs____::57784c9e047e826fefdb1ef816120d92' --Arts et Métiers ParisTech
) )) foo; ) )) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS; ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@@ -1,4 +1,4 @@
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Graph Stats Hive" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>stats_db_name</name> <name>stats_db_name</name>
@@ -53,6 +53,10 @@
<name>context_api_url</name> <name>context_api_url</name>
<description>the base url of the context api (https://services.openaire.eu/openaire)</description> <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
</property> </property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters> </parameters>
<global> <global>
@@ -74,7 +78,40 @@
</configuration> </configuration>
</global> </global>
<start to="Step1"/> <start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="Step1">${wf:conf('resumeFrom') eq 'Step1'}</case>
<case to="Step2">${wf:conf('resumeFrom') eq 'Step2'}</case>
<case to="Step3">${wf:conf('resumeFrom') eq 'Step3'}</case>
<case to="Step4">${wf:conf('resumeFrom') eq 'Step4'}</case>
<case to="Step5">${wf:conf('resumeFrom') eq 'Step5'}</case>
<case to="Step6">${wf:conf('resumeFrom') eq 'Step6'}</case>
<case to="Step7">${wf:conf('resumeFrom') eq 'Step7'}</case>
<case to="Step8">${wf:conf('resumeFrom') eq 'Step8'}</case>
<case to="Step9">${wf:conf('resumeFrom') eq 'Step9'}</case>
<case to="Step10">${wf:conf('resumeFrom') eq 'Step10'}</case>
<case to="Step11">${wf:conf('resumeFrom') eq 'Step11'}</case>
<case to="Step12">${wf:conf('resumeFrom') eq 'Step12'}</case>
<case to="Step13">${wf:conf('resumeFrom') eq 'Step13'}</case>
<case to="Step14">${wf:conf('resumeFrom') eq 'Step14'}</case>
<case to="Step15">${wf:conf('resumeFrom') eq 'Step15'}</case>
<case to="Step15_5">${wf:conf('resumeFrom') eq 'Step15_5'}</case>
<case to="Contexts">${wf:conf('resumeFrom') eq 'Contexts'}</case>
<case to="Step16-createIndicatorsTables">${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'}</case>
<case to="Step16_1-definitions">${wf:conf('resumeFrom') eq 'Step16_1-definitions'}</case>
<case to="Step16_5">${wf:conf('resumeFrom') eq 'Step16_5'}</case>
<case to="Step19-finalize">${wf:conf('resumeFrom') eq 'Step19-finalize'}</case>
<case to="step20-createMonitorDB">${wf:conf('resumeFrom') eq 'step20-createMonitorDB'}</case>
<case to="step21-createObservatoryDB-pre">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'}</case>
<case to="step21-createObservatoryDB">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'}</case>
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
<default to="Step1"/>
</switch>
</decision>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -302,22 +339,22 @@
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param> <param>openaire_db_name=${openaire_db_name}</param>
</hive2> </hive2>
<ok to="step20-createMonitorDB"/> <ok to="Step19-finalize"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="Step19-finalize">--> <action name="Step19-finalize">
<!-- <shell xmlns="uri:oozie:shell-action:0.1">--> <shell xmlns="uri:oozie:shell-action:0.1">
<!-- <job-tracker>${jobTracker}</job-tracker>--> <job-tracker>${jobTracker}</job-tracker>
<!-- <name-node>${nameNode}</name-node>--> <name-node>${nameNode}</name-node>
<!-- <exec>finalizedb.sh</exec>--> <exec>finalizedb.sh</exec>
<!-- <argument>${stats_db_name}</argument>--> <argument>${stats_db_name}</argument>
<!-- <argument>${stats_db_shadow_name}</argument>--> <argument>${stats_db_shadow_name}</argument>
<!-- <file>finalizedb.sh</file>--> <file>finalizedb.sh</file>
<!-- </shell>--> </shell>
<!-- <ok to="step20-createMonitorDB"/>--> <ok to="step20-createMonitorDB"/>
<!-- <error to="Kill"/>--> <error to="Kill"/>
<!-- </action>--> </action>
<action name="step20-createMonitorDB"> <action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
@@ -355,55 +392,57 @@
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>observatory_db_name=${observatory_db_name}</param> <param>observatory_db_name=${observatory_db_name}</param>
</hive2> </hive2>
<ok to="End"/> <ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="step21-createObservatoryDB-post">--> <action name="step21-createObservatoryDB-post">
<!-- <shell xmlns="uri:oozie:shell-action:0.1">--> <shell xmlns="uri:oozie:shell-action:0.1">
<!-- <job-tracker>${jobTracker}</job-tracker>--> <job-tracker>${jobTracker}</job-tracker>
<!-- <name-node>${nameNode}</name-node>--> <name-node>${nameNode}</name-node>
<!-- <exec>observatory-post.sh</exec>--> <exec>observatory-post.sh</exec>
<!-- <argument>${stats_db_name}</argument>--> <argument>${stats_db_name}</argument>
<!-- <argument>${observatory_db_name}</argument>--> <argument>${observatory_db_name}</argument>
<!-- <argument>${observatory_db_shadow_name}</argument>--> <argument>${observatory_db_shadow_name}</argument>
<!-- <file>observatory-post.sh</file>--> <file>observatory-post.sh</file>
<!-- </shell>--> </shell>
<!-- <ok to="step22-copyDataToImpalaCluster"/>--> <ok to="step22-copyDataToImpalaCluster"/>
<!-- <error to="Kill"/>--> <error to="Kill"/>
<!-- </action>--> </action>
<!-- <action name="step22-copyDataToImpalaCluster">--> <action name="step22-copyDataToImpalaCluster">
<!-- <shell xmlns="uri:oozie:shell-action:0.1">--> <shell xmlns="uri:oozie:shell-action:0.1">
<!-- <job-tracker>${jobTracker}</job-tracker>--> <job-tracker>${jobTracker}</job-tracker>
<!-- <name-node>${nameNode}</name-node>--> <name-node>${nameNode}</name-node>
<!-- <exec>copyDataToImpalaCluster.sh</exec>--> <exec>copyDataToImpalaCluster.sh</exec>
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
<!-- <argument>${external_stats_db_name}</argument>--> <!-- <argument>${external_stats_db_name}</argument>-->
<!-- <argument>${stats_db_name}</argument>--> <argument>${stats_db_name}</argument>
<!-- <argument>${monitor_db_name}</argument>--> <argument>${monitor_db_name}</argument>
<!-- <argument>${observatory_db_name}</argument>--> <argument>${observatory_db_name}</argument>
<!-- <file>copyDataToImpalaCluster.sh</file>--> <argument>${hadoop_user_name}</argument>
<!-- </shell>--> <file>copyDataToImpalaCluster.sh</file>
<!-- <ok to="step23-finalizeImpalaCluster"/>--> </shell>
<!-- <error to="Kill"/>--> <ok to="step23-finalizeImpalaCluster"/>
<!-- </action>--> <error to="Kill"/>
</action>
<!-- <action name="step23-finalizeImpalaCluster">--> <action name="step23-finalizeImpalaCluster">
<!-- <shell xmlns="uri:oozie:shell-action:0.1">--> <shell xmlns="uri:oozie:shell-action:0.1">
<!-- <job-tracker>${jobTracker}</job-tracker>--> <job-tracker>${jobTracker}</job-tracker>
<!-- <name-node>${nameNode}</name-node>--> <name-node>${nameNode}</name-node>
<!-- <exec>finalizeImpalaCluster.sh</exec>--> <exec>finalizeImpalaCluster.sh</exec>
<!-- <argument>${stats_db_name}</argument>--> <argument>${stats_db_name}</argument>
<!-- <argument>${stats_db_shadow_name}</argument>--> <argument>${stats_db_shadow_name}</argument>
<!-- <argument>${monitor_db_name}</argument>--> <argument>${monitor_db_name}</argument>
<!-- <argument>${monitor_db_shadow_name}</argument>--> <argument>${monitor_db_shadow_name}</argument>
<!-- <argument>${observatory_db_name}</argument>--> <argument>${observatory_db_name}</argument>
<!-- <argument>${observatory_db_shadow_name}</argument>--> <argument>${observatory_db_shadow_name}</argument>
<!-- <file>finalizeImpalaCluster.sh</file>--> <file>finalizeImpalaCluster.sh</file>
<!-- </shell>--> </shell>
<!-- <ok to="End"/>--> <ok to="End"/>
<!-- <error to="Kill"/>--> <error to="Kill"/>
<!-- </action>--> </action>
<!-- <action name="Step24-updateCache">--> <!-- <action name="Step24-updateCache">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">--> <!-- <shell xmlns="uri:oozie:shell-action:0.1">-->