From 75bfde043cd131c84055f0e0ca93d03e4a9e2d13 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 4 Jan 2024 15:11:04 +0200 Subject: [PATCH] Historical Snapshots Workflow Create historical snapshots db with parameters: hist_db_name=openaire_beta_historical_snapshots_xxx hist_db_name_prev=openaire_beta_historical_snapshots_xxx (previous run of wf) stats_db_name=openaire_beta_stats_xxx stats_irish_db_name=openaire_beta_stats_monitor_ie_xxx monitor_db_name=openaire_beta_stats_monitor_xxx monitor_db_prod_name=openaire_beta_stats_monitor monitor_irish_db_name=openaire_beta_stats_monitor_ie_xxx monitor_irish_db_prod_name=openaire_beta_stats_monitor_ie hist_db_prod_name=openaire_beta_historical_snapshots hist_db_shadow_name=openaire_beta_historical_snapshots_shadow hist_date=122023 hive_timeout=150000 hadoop_user_name=xxx resumeFrom=CreateDB --- dhp-workflows/dhp-stats-hist-snaps/pom.xml | 32 ++++ .../oozie_app/config-default.xml | 30 ++++ .../oozie_app/copyDataToImpalaCluster.sh | 71 ++++++++ .../oozie_app/finalizeImpalaCluster.sh | 43 +++++ .../stats-hist-snaps/oozie_app/hist_snaps.sh | 27 +++ .../oozie_app/scripts/BuildHistSnapsAll.sql | 82 +++++++++ .../oozie_app/scripts/BuildHistSnapsIrish.sql | 91 ++++++++++ .../oozie_app/scripts/CreateDB.sql | 92 ++++++++++ .../stats-hist-snaps/oozie_app/workflow.xml | 159 ++++++++++++++++++ 9 files changed, 627 insertions(+) create mode 100644 dhp-workflows/dhp-stats-hist-snaps/pom.xml create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql create mode 100644 dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-stats-hist-snaps/pom.xml b/dhp-workflows/dhp-stats-hist-snaps/pom.xml new file mode 100644 index 000000000..08ddcef49 --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/pom.xml @@ -0,0 +1,32 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-stats-hist-snaps + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + false + + + + + diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml new file mode 100644 index 000000000..b2a1322e6 --- /dev/null +++ 
b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..ef9bb9495 --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,71 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export HADOOP_USER_NAME=$2 + +function copydb() { + + + db=$1 + FILE=("hive_wf_tmp_"$RANDOM) + hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + # change ownership to impala +# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + + + # copy the databases from ocean to impala + echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + + # drop tables from db + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; + done + + # drop views from db + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; + done + + # delete the database + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + impala-shell -q "INVALIDATE METADATA" + echo "creating schema for ${db}" + for (( k = 0; k < 5; k ++ )); do + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + done + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' 
into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs +hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db +} + +MONITOR_DB=$1 +#HADOOP_USER_NAME=$2 +copydb $MONITOR_DB + diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..e2277083d --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,43 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +SOURCE=$1 +PRODUCTION=$2 +SHADOW=$3 +MONITOR=$4 +MONITOR_PROD=$5 +MONITOR_IRISH_PROD=$6 +MONITOR_IRISH=$7 + + +echo ${SOURCE} +echo ${PRODUCTION} + +#echo "Updating ${PRODUCTION} monitor database old cluster" +#impala-shell -q "create database if not exists ${PRODUCTION}" +#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - + +echo "Updating ${PRODUCTION} historical snapshots database" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - +echo "Production monitor db ready!" 
+ +impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots" +impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots_fos" + +impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots as select * from ${SOURCE}.historical_snapshots" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots_fos as select * from ${SOURCE}.historical_snapshots_fos" + +impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish" +impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos" + + +impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish as select * from ${SOURCE}.historical_snapshots_irish" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos as select * from ${SOURCE}.historical_snapshots_irish_fos" diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh new file mode 100644 index 000000000..bcaa7984c --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh @@ -0,0 +1,27 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +#update Monitor DB IRISH +#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo +cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo +hive $HIVE_OPTS -f foo + +echo "Hive shell finished" + diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql new file mode 100644 index 000000000..93d804820 --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql @@ -0,0 +1,82 @@ +INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp +SELECT * FROM ${hist_db_name_prev}.historical_snapshots_fos; + +INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp +select + cast(${hist_date} as STRING), + count(distinct r.id), + r.type, + rf.lvl1, + rf.lvl2, + pf.publicly_funded, + r.access_mode, + r.gold, + r.green, + coalesce(gl.green_with_license,0), + h.is_hybrid, + b.is_bronze_oa, + d.in_diamond_journal, + t.is_transformative, + pr.refereed +from
${stats_db_name}.result r + left outer join ${stats_db_name}.result_fos rf on rf.id=r.id + left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id + left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id + left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id + left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id + left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id + left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id + left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id +group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed; + +drop table if exists ${hist_db_name}.historical_snapshots_fos purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_fos STORED AS PARQUET AS +SELECT * FROM ${hist_db_name}.historical_snapshots_fos_tmp; + +drop table if exists ${monitor_db_name}.historical_snapshots_fos purge; + +create table ${monitor_db_name}.historical_snapshots_fos stored as parquet +as select * from ${hist_db_name}.historical_snapshots_fos; + +drop table ${hist_db_name}.historical_snapshots_fos_tmp purge; + +INSERT INTO ${hist_db_name}.historical_snapshots_tmp +SELECT * FROM ${hist_db_name_prev}.historical_snapshots; + +INSERT INTO ${hist_db_name}.historical_snapshots_tmp +select + cast(${hist_date} as STRING), + count(distinct r.id), + r.type, + pf.publicly_funded, + r.access_mode, + r.gold, + r.green, + coalesce(gl.green_with_license,0), + h.is_hybrid, + b.is_bronze_oa, + d.in_diamond_journal, + t.is_transformative, + pr.refereed +from ${stats_db_name}.result r + left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id + left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id + left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id + left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id + left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id + left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id + left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id +group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed; + +drop table if exists ${hist_db_name}.historical_snapshots purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots STORED AS PARQUET AS +SELECT * FROM ${hist_db_name}.historical_snapshots_tmp; + +drop table if exists ${monitor_db_name}.historical_snapshots purge; + +create table ${monitor_db_name}.historical_snapshots stored as parquet +as select * from ${hist_db_name}.historical_snapshots; + +drop table ${hist_db_name}.historical_snapshots_tmp purge; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql new file mode 100644 index 000000000..95e811f64 --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql @@ -0,0 +1,91 @@ +INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp +SELECT * FROM
${hist_db_name_prev}.historical_snapshots_irish_fos; + +INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp +select + cast(${hist_date} as STRING), + count(distinct r.id), + r.type, + rf.lvl1, + rf.lvl2, + pf.publicly_funded, + r.access_mode, + r.gold, + r.green, + coalesce(gl.green_with_license,0), + h.is_hybrid, + b.is_bronze_oa, + d.in_diamond_journal, + t.is_transformative, + pr.refereed +from ${stats_irish_db_name}.result r + left outer join ${stats_irish_db_name}.result_fos rf on rf.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id + left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id +group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed; + +drop table if exists ${hist_db_name}.historical_snapshots_irish_fos purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS +SELECT * FROM ${hist_db_name}.historical_snapshots_fos_irish_tmp; + +drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge; + +create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet +as select * from ${hist_db_name}.historical_snapshots_irish_fos; + +drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge; + +INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp +SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish; + +INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp +select + cast(${hist_date} as STRING), + count(distinct r.id), + r.type, + pf.publicly_funded, + r.access_mode, + r.gold, + r.green, + coalesce(gl.green_with_license,0), + h.is_hybrid, + b.is_bronze_oa, + d.in_diamond_journal, + t.is_transformative, + pr.refereed +from ${stats_irish_db_name}.result r + left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id + left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id + left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id +group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed; + + +drop table if exists ${hist_db_name}.historical_snapshots_irish purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_irish STORED AS PARQUET AS +SELECT * FROM ${hist_db_name}.historical_snapshots_irish_tmp; + +drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish purge; + +create table ${monitor_irish_db_name}.historical_snapshots_irish stored as parquet +as select * from ${hist_db_name}.historical_snapshots_irish; + +drop table ${hist_db_name}.historical_snapshots_irish_tmp purge; + + +drop table 
if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge; + +create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet +as select * from ${hist_db_name}.historical_snapshots_irish_fos; + +drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql new file mode 100644 index 000000000..18af135bf --- /dev/null +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql @@ -0,0 +1,92 @@ +-------------------------------------------------------------- +-------------------------------------------------------------- +-- Historical Snapshots database creation +-------------------------------------------------------------- +-------------------------------------------------------------- + +DROP database IF EXISTS ${hist_db_name} CASCADE; +CREATE database ${hist_db_name}; + +drop table if exists ${hist_db_name}.historical_snapshots_fos_tmp purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_fos_tmp +( + hist_date STRING, + total INT, + type STRING, + lvl1 STRING, + lvl2 STRING, + publicly_funded INT, + accessrights STRING, + gold INT, + green INT, + green_with_license INT, + hybrid INT, + bronze INT, + diamond INT, + transformative INT, + peer_reviewed STRING +) +CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true'); + +drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp +( + hist_date STRING, + total INT, + type STRING, + lvl1 STRING, + lvl2 STRING, + publicly_funded INT, + accessrights STRING, + gold INT, + green INT, + green_with_license INT, + hybrid INT, + bronze INT, + diamond INT, + transformative INT, + peer_reviewed STRING +) +CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true'); + +drop table if exists ${hist_db_name}.historical_snapshots_tmp purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_tmp +( + hist_date STRING, + total INT, + type STRING, + publicly_funded INT, + accessrights STRING, + gold INT, + green INT, + green_with_license INT, + hybrid INT, + bronze INT, + diamond INT, + transformative INT, + peer_reviewed STRING +) +CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true'); + +drop table if exists ${hist_db_name}.historical_snapshots_irish_tmp purge; + +CREATE TABLE ${hist_db_name}.historical_snapshots_irish_tmp +( + hist_date STRING, + total INT, + type STRING, + publicly_funded INT, + accessrights STRING, + gold INT, + green INT, + green_with_license INT, + hybrid INT, + bronze INT, + diamond INT, + transformative INT, + peer_reviewed STRING +) +CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true'); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml new file mode 100644 index 000000000..3245e0859 --- /dev/null +++ 
b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml @@ -0,0 +1,159 @@ + + + + hist_db_name + the target hist database name + + + hist_db_name_prev + the hist database name of previous_month + + + + stats_db_name + the stats db name + + + stats_irish_db_name + the stats irish db name + + + monitor_db_name + the monitor db name + + + monitor_irish_db_name + the irish monitor db name + + + hist_db_prod_name + the production db + + + hist_db_shadow_name + the production shadow db + + + hist_date + the snaps date + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. + + + hadoop_user_name + user name of the wf owner + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + mapred.job.queue.name + analytics + + + + + + + + ${wf:conf('resumeFrom') eq 'CreateDB'} + ${wf:conf('resumeFrom') eq 'BuildHistSnaps'} + ${wf:conf('resumeFrom') eq 'BuildHistSnapsIrish'} + ${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${hive_jdbc_url} + + hist_db_name=${hist_db_name} + + + + + + + + ${hive_jdbc_url} + + hist_db_name=${hist_db_name} + hist_db_name_prev=${hist_db_name_prev} + stats_db_name=${stats_db_name} + monitor_db_name=${monitor_db_name} + hist_date=${hist_date} + + + + + + + + ${hive_jdbc_url} + + hist_db_name=${hist_db_name} + hist_db_name_prev=${hist_db_name_prev} + stats_irish_db_name=${stats_irish_db_name} + monitor_irish_db_name=${monitor_irish_db_name} + hist_date=${hist_date} + + + + + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${hist_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${hist_db_name} + ${hist_db_prod_name} + ${monitor_db_name} + ${monitor_db_name} + ${monitor_irish_db_name} + finalizeImpalaCluster.sh + + + + + + +
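For reference, the parameters listed in the commit message map one-to-one onto a job.properties file for the Oozie CLI. A minimal submission sketch follows, assuming a placeholder application path and Oozie server URL (illustrative only, not defined by this patch):

# job.properties -- parameter values taken from the commit message; paths and hosts are placeholders
oozie.wf.application.path=hdfs://nameservice1/<deployment-path>/stats-hist-snaps/oozie_app
hist_db_name=openaire_beta_historical_snapshots_xxx
hist_db_name_prev=openaire_beta_historical_snapshots_xxx
stats_db_name=openaire_beta_stats_xxx
stats_irish_db_name=openaire_beta_stats_monitor_ie_xxx
monitor_db_name=openaire_beta_stats_monitor_xxx
monitor_db_prod_name=openaire_beta_stats_monitor
monitor_irish_db_name=openaire_beta_stats_monitor_ie_xxx
monitor_irish_db_prod_name=openaire_beta_stats_monitor_ie
hist_db_prod_name=openaire_beta_historical_snapshots
hist_db_shadow_name=openaire_beta_historical_snapshots_shadow
hist_date=122023
hive_timeout=150000
hadoop_user_name=xxx
resumeFrom=CreateDB

# submit with the standard Oozie CLI (server URL is a placeholder)
oozie job -oozie http://<oozie-host>:11000/oozie -config job.properties -run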