diff --git a/dhp-workflows/dhp-indicators/nb-configuration.xml b/dhp-workflows/dhp-indicators/nb-configuration.xml new file mode 100644 index 000000000..a65c4514a --- /dev/null +++ b/dhp-workflows/dhp-indicators/nb-configuration.xml @@ -0,0 +1,18 @@ + + + + + + JDK_1.8 + + diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml new file mode 100755 index 000000000..72ad153f1 --- /dev/null +++ b/dhp-workflows/dhp-indicators/pom.xml @@ -0,0 +1,107 @@ + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + ../ + + 4.0.0 + dhp-indicators + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + 2.7.4 + jar + + + eu.dnetlib.dhp + dhp-common + 1.1.7-SNAPSHOT + jar + + + com.mchange + c3p0 + 0.9.5.2 + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + org.slf4j + slf4j-api + 1.7.26 + jar + + + dhp-indicators + diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh new file mode 100755 index 000000000..0cad5792d --- /dev/null +++ b/dhp-workflows/dhp-indicators/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml new file mode 100644 index 000000000..6d255a7f4 --- /dev/null +++ 
b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + \ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql
new file mode 100644
index 000000000..0a96063cb
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql
@@ -0,0 +1,15 @@
+create table TARGET.funders_publications stored as parquet as
+-- One row per funder id: number of non-null publication results reached through the projects whose funder column equals the funder name
+select
+    f.id as id,
+    count(pr.result) as total_pubs
+from SOURCE.funder as f
+inner join SOURCE.project as p
+    on f.name = p.funder
+inner join SOURCE.project_results_publication as pr
+    on pr.project_results = p.id
+group by
+    f.id,
+    f.name;
+
+compute stats TARGET.funders_publications;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh
new file mode 100644
index 000000000..306609e8a
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh
@@ -0,0 +1,47 @@
+# Builds the indicators database with impala-shell and refreshes the shadow views.
+#
+# Arguments:
+#   $1  source (stats) database name
+#   $2  target indicators database name; dropped and recreated on every run
+#   $3  shadow database name; its views are dropped and recreated over the target
+#   $4  HDFS path of the SQL script that creates the indicator tables
+if [ "$#" -lt 4 ]
+then
+  echo "Usage: $0 <source_db> <target_db> <shadow_db> <hdfs_sql_script_path>" >&2
+  exit 1
+fi
+
+# Make sure the /tmp egg-cache path is a symlink; anything else found there is removed first.
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L "$link_folder" ]
+then
+  rm -Rf "$link_folder"
+  ln -sfn "${PYTHON_EGG_CACHE}${link_folder}" "${link_folder}"
+fi
+
+export SOURCE=$1
+export TARGET=$2
+export SHADOW=$3
+export SCRIPT_PATH=$4
+
+echo "Getting file from " $4
+hdfs dfs -copyToLocal "$4"
+
+echo "Creating indicators database"
+impala-shell -q "drop database if exists ${TARGET} cascade"
+impala-shell -q "create database if not exists ${TARGET}"
+# Expose every table of the source database in the target database through a view.
+impala-shell -d "${SOURCE}" -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
+# Substitute the SOURCE/TARGET placeholders of the downloaded SQL script and run it.
+# Plain "g" flag on purpose: combining "g" with a number (as in "g1") is a GNU sed extension.
+sed -e "s/SOURCE/${SOURCE}/g" -e "s/TARGET/${TARGET}/g" "$(basename "$SCRIPT_PATH")" | impala-shell -f -
+echo "Indicators Database created"
+
+
+echo "Updating Shadow indicators DB"
+impala-shell -q "create database if not exists ${SHADOW}"
+# Drop every existing shadow view, then recreate one per target table.
+impala-shell -d "${SHADOW}" -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
+impala-shell -d "${TARGET}" -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
+echo "Indicators Shadow DB ready!"
\ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml new file mode 100644 index 000000000..ec917b9a4 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml @@ -0,0 +1,101 @@ + + + + stats_db_name + the source stats database name + + + indicators_db_name + the target indicators database name + + + indicators_shadow_db_name + the name of the shadow schema + + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + + + + + + + ${jobTracker} + ${nameNode} + indicators.sh + ${stats_db_name} + ${indicators_db_name} + ${indicators_shadow_db_name} + ${wf:appPath()}/scripts/createIndicatorsTables.sql + scripts/indicators.sh + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + \ No newline at end of file