diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml new file mode 100644 index 000000000..63fc84d75 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + stats_tool_api_url + ${stats_tool_api_url} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh new file mode 100644 index 000000000..b8c71681a --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor-post.sh @@ -0,0 +1,21 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +impala-shell -q "invalidate metadata;" +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow monitor database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh new file mode 100644 index 000000000..f39bf4893 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/monitor.sh @@ -0,0 +1,24 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH + +echo "Creating monitor database" +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" \ No newline at end of file diff --git a/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml new file mode 100644 index 000000000..7e4cfc759 --- /dev/null +++ b/dhp-workflows/dhp-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/monitor/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + stats_db_name + the target stats database name + + + stats_db_shadow_name + the name of the shadow schema + + + monitor_db_name + the target monitor db name + + + monitor_db_shadow_name + the name of the shadow monitor db + + + stats_tool_api_url + The url of the API of the stats tool. Is used to trigger the cache update. + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds. 
+ + + context_api_url + the base url of the context api (https://services.openaire.eu/openaire) + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + monitor.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/createMonitorDB.sql + monitor.sh + + + + + + + + + ${jobTracker} + ${nameNode} + monitor-post.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + monitor-post.sh + + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + + + + + + + \ No newline at end of file