Add indicators wf
This commit is contained in:
parent
c90dd653c2
commit
da1f123d7b
|
@ -0,0 +1,18 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project-shared-configuration>
|
||||
<!--
|
||||
This file contains additional configuration written by modules in the NetBeans IDE.
|
||||
The configuration is intended to be shared among all the users of project and
|
||||
therefore it is assumed to be part of version control checkout.
|
||||
Without this configuration present, some functionality in the IDE may be limited or fail altogether.
|
||||
-->
|
||||
<properties xmlns="http://www.netbeans.org/ns/maven-properties-data/1">
|
||||
<!--
|
||||
Properties that influence various parts of the IDE, especially code formatting and the like.
|
||||
You can copy and paste the single properties, into the pom.xml file and the IDE will pick them up.
|
||||
That way multiple projects can share the same settings (useful for formatting rules for example).
|
||||
Any value defined here will override the pom.xml file value but is only applicable to the current project.
|
||||
-->
|
||||
<netbeans.hint.jdkPlatform>JDK_1.8</netbeans.hint.jdkPlatform>
|
||||
</properties>
|
||||
</project-shared-configuration>
|
|
@ -0,0 +1,107 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
|
||||
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-indicators</artifactId>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<version>2.1.15</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>revision</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
|
||||
<!-- more config here as you see fit -->
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.6.1</version>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
|
||||
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
<version>2.2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
<version>2.4.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.googlecode.json-simple</groupId>
|
||||
<artifactId>json-simple</artifactId>
|
||||
<version>1.1.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.json</groupId>
|
||||
<artifactId>json</artifactId>
|
||||
<version>20180130</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hive</groupId>
|
||||
<artifactId>hive-jdbc</artifactId>
|
||||
<version>${cdh.hive.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
<version>2.7.4</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.mchange</groupId>
|
||||
<artifactId>c3p0</artifactId>
|
||||
<version>0.9.5.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>c3p0</groupId>
|
||||
<artifactId>c3p0</artifactId>
|
||||
<version>0.9.1.2</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>1.7.26</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<name>dhp-indicators</name>
|
||||
</project>
|
|
@ -0,0 +1 @@
|
|||
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators
|
|
@ -0,0 +1,34 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>${jobTracker}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>${nameNode}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
<!-- <property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<value>${stats_tool_api_url}</value>
|
||||
</property>-->
|
||||
</configuration>
|
|
@ -0,0 +1,7 @@
|
|||
create table TARGET.funders_publications stored as parquet as
|
||||
select f.id as id, count(pr.result) as total_pubs from SOURCE.funder f
|
||||
join SOURCE.project p on f.name=p.funder
|
||||
join SOURCE.project_results_publication pr on pr.project_results=p.id group by f.id, f.name;
|
||||
|
||||
|
||||
compute stats TARGET.funders_publications;
|
|
@ -0,0 +1,29 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
export SOURCE=$1
|
||||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
export SCRIPT_PATH=$4
|
||||
|
||||
echo "Getting file from " $4
|
||||
hdfs dfs -copyToLocal $4
|
||||
|
||||
echo "Creating indicators database"
|
||||
impala-shell -q "drop database if exists ${TARGET} cascade"
|
||||
impala-shell -q "create database if not exists ${TARGET}"
|
||||
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
|
||||
cat createIndicatorsTables.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f -
|
||||
echo "Indicators Database created"
|
||||
|
||||
|
||||
echo "Updating Shadow indicators DB"
|
||||
impala-shell -q "create database if not exists ${SHADOW}"
|
||||
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
|
||||
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
|
||||
echo "Indicators Shadow DB ready!"
|
|
@ -0,0 +1,101 @@
|
|||
<workflow-app name="Indicators" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
<description>the source stats database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>indicators_db_name</name>
|
||||
<description>the target indicators database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>indicators_shadow_db_name</name>
|
||||
<description>the name of the shadow schema</description>
|
||||
</property>
|
||||
<!-- <property>
|
||||
<name>openaire_db_name</name>
|
||||
<description>the original graph database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>external_stats_db_name</name>
|
||||
<value>stats_ext</value>
|
||||
<description>the external stats that should be added since they are not included in the graph database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_db_shadow_name</name>
|
||||
<description>the name of the shadow schema</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_db_name</name>
|
||||
<description>the target monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_db_shadow_name</name>
|
||||
<description>the name of the shadow monitor db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>observatory_db_name</name>
|
||||
<description>the target monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>observatory_db_shadow_name</name>
|
||||
<description>the name of the shadow monitor db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
|
||||
</property>-->
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<!-- <property>
|
||||
<name>hive_timeout</name>
|
||||
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
|
||||
</property>-->
|
||||
<!-- <property>
|
||||
<name>context_api_url</name>
|
||||
<description>the base url of the context api (https://services.openaire.eu/openaire)</description>
|
||||
</property>-->
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<!-- <property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>-->
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="Step1-createIndicatorsDB"/>
|
||||
<action name="Step1-createIndicatorsDB">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>indicators.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${indicators_db_name}</argument>
|
||||
<argument>${indicators_shadow_db_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/createIndicatorsTables.sql</argument>
|
||||
<file>scripts/indicators.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue