Basic role that installs the Prometheus Alertmanager.
parent
9c44f5e486
commit
c21c6c6c9a
@ -1,2 +1,44 @@
|
||||
---
# Default variables for the Prometheus Alertmanager role.

# --- Installation -----------------------------------------------------
# Set to false to have the role remove the alertmanager instead.
prometheus_alert_m_install: true
prometheus_alert_m_version: "0.25.0"
# The download URL resolves to something like
# https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
prometheus_alert_m_download_url: "https://github.com/prometheus/alertmanager/releases/download/v{{ prometheus_alert_m_version }}/{{ prometheus_alert_m_file }}"

# --- Web listener and logging -----------------------------------------
prometheus_alert_m_listen: "{{ ansible_default_ipv4.address }}"
prometheus_alert_m_port: 9093
prometheus_alert_m_behind_proxy: false
prometheus_alert_m_ext_url: 'https://localhost'
prometheus_alert_m_loglevel: 'info'

# --- Command line -----------------------------------------------------
# Options passed to the alertmanager binary by the systemd unit. The lines
# below are folded into a single space-separated string.
prometheus_alert_m_opts: >-
  --web.listen-address={{ prometheus_alert_m_listen }}:{{ prometheus_alert_m_port }}
  --log.level={{ prometheus_alert_m_loglevel }}
  --config.file={{ prometheus_alert_m_conf_file }}
  --storage.path={{ prometheus_alert_m_data_dir }}
  {{ prometheus_alert_m_cluster_opts }}
# List the additional options here
prometheus_alert_m_additional_opts: ""

# --- Filesystem layout ------------------------------------------------
prometheus_alert_m_conf_dir: '/opt/prometheus/alertmanager/conf'
prometheus_alert_m_conf_file: '{{ prometheus_alert_m_conf_dir }}/alertmanager.yml'
prometheus_alert_m_data_dir: '/opt/prometheus/alertmanager/data'

# --- Firewalld --------------------------------------------------------
prometheus_alert_m_firewalld_rules: "enabled"
prometheus_alert_m_firewalld_ports:
  - port: "{{ prometheus_alert_m_port }}"
    protocol: "tcp"
    state: "{{ prometheus_alert_m_firewalld_rules }}"
    zone: "{{ firewalld_default_zone }}"

# --- Configuration file -----------------------------------------------
# The configuration template is only installed when this is true.
prometheus_alert_m_install_conf: false
prometheus_alert_m_smtp_smarthost: 'localhost:25'
prometheus_alert_m_smtp_from: 'alerts@localhost'
prometheus_alert_m_smtp_authenticated: false

prometheus_alert_m_default_receiver: 'global-alerts'
# Kept as a string: it is pasted verbatim into the configuration template.
prometheus_alert_m_alerts_group_by: "['alertname', 'cluster']"

# --- Clustering -------------------------------------------------------
prometheus_alert_m_cluster_enabled: false
prometheus_alert_m_cluster_port: 9094
prometheus_alert_m_cluster_addr: "0.0.0.0"
prometheus_alert_m_cluster_advertise: "{{ ansible_default_ipv4.address }}"
prometheus_alert_m_cluster_peers:
  - 'localhost:{{ prometheus_alert_m_cluster_port }}'
# With clustering disabled an empty cluster listen address is passed;
# otherwise the listen/advertise addresses and one --cluster.peer flag per
# configured peer are emitted.
prometheus_alert_m_cluster_opts: >-
  {% if not prometheus_alert_m_cluster_enabled %}
  --cluster.listen-address=''{% else %}
  --cluster.listen-address={{ prometheus_alert_m_cluster_addr }}:{{ prometheus_alert_m_cluster_port }}
  --cluster.advertise-address={{ prometheus_alert_m_cluster_advertise}}:{{prometheus_alert_m_cluster_port }}{% for peer in prometheus_alert_m_cluster_peers %}
  --cluster.peer={{ peer }}{% endfor %}{% endif %}

# Local Variables:
# eval: (ansible-doc-mode 1)
# eval: (add-to-list (quote company-backends) (quote company-ansible))
# eval: (ansible 1)
# End:
|
||||
|
@ -1,2 +1,13 @@
|
||||
---
# Handlers for the Prometheus Alertmanager role.

# Restarts the alertmanager service and keeps it enabled.
- name: Restart alertmanager
  service:
    enabled: true
    state: restarted
    name: alertmanager

# Despite its name, this handler currently performs a restart as well:
# the "reloaded" state is left commented out.
- name: Reload alertmanager
  service:
    enabled: true
    # state: reloaded
    state: restarted
    name: alertmanager
|
||||
|
@ -1,46 +1,29 @@
|
||||
# Ansible Galaxy metadata for the role.
# NOTE(review): several keys below (author, description, company,
# issue_tracker_url, license, platforms, galaxy_tags) appear twice in this
# view - presumably the old and new sides of a diff. Confirm which values
# are current; duplicate keys are not valid YAML.
galaxy_info:
|
||||
author: your name
|
||||
description: your description
|
||||
company: your company (optional)
|
||||
author: Andrea Dell'Amico
|
||||
description: Systems Architect
|
||||
company: ISTI-CNR
|
||||
|
||||
# If the issue tracker for your role is not on github, uncomment the
|
||||
# next line and provide a value
|
||||
issue_tracker_url: https://support.d4science.org/projects/automatic-provisioning/issues
|
||||
issue_tracker_url: https://redmine-s2i2s.isti.cnr.it/projects/provisioning
|
||||
|
||||
license: EUPL-1.2
|
||||
license: EUPL 1.2+
|
||||
|
||||
min_ansible_version: 2.8
|
||||
|
||||
# If this a Container Enabled role, provide the minimum Ansible Container version.
|
||||
# min_ansible_container_version:
|
||||
|
||||
# Optionally specify the branch Galaxy will use when accessing the GitHub
|
||||
# repo for this role. During role install, if no tags are available,
|
||||
# Galaxy will use this branch. During import Galaxy will access files on
|
||||
# this branch. If Travis integration is configured, only notifications for this
|
||||
# branch will be accepted. Otherwise, in all cases, the repo's default branch
|
||||
# (usually master) will be used.
|
||||
#github_branch:
|
||||
|
||||
#
|
||||
# Provide a list of supported platforms, and for each platform a list of versions.
|
||||
# If you don't wish to enumerate all versions for a particular platform, use 'all'.
|
||||
# To view available platforms and versions (or releases), visit:
|
||||
# https://galaxy.ansible.com/api/v1/platforms/
|
||||
#
|
||||
platforms:
|
||||
- name: Ubuntu
|
||||
versions:
|
||||
- bionic
|
||||
|
||||
galaxy_tags: []
|
||||
# List tags for your role here, one per line. A tag is a keyword that describes
|
||||
# and categorizes the role. Users find roles by searching for tags. Be sure to
|
||||
# remove the '[]' above, if you add tags to this list.
|
||||
#
|
||||
# NOTE: A tag is limited to a single word comprised of alphanumeric characters.
|
||||
# Maximum 20 tags per role.
|
||||
- name: Ubuntu
|
||||
versions:
|
||||
- trusty
|
||||
- bionic
|
||||
- name: EL
|
||||
versions:
|
||||
- 7
|
||||
- 8
|
||||
|
||||
galaxy_tags:
|
||||
- monitoring
|
||||
- metrics
|
||||
|
||||
dependencies: []
|
||||
# List your role dependencies here, one per line. Be sure to remove the '[]' above,
|
||||
# if you add dependencies to this list.
|
@ -1,2 +1,157 @@
|
||||
---
|
||||
# tasks file for the Prometheus alertmanager role
|
||||
# Install path: create the service account and the directory layout, fetch
# the release tarball, unpack it and copy the binaries to /usr/local/bin.
# Guarded by prometheus_alert_m_install so that it does not run when the
# role is asked to remove the alertmanager.
- name: Install the Prometheus alertmanager
  tags: ['prometheus', 'prometheus_alertmanager']
  when: prometheus_alert_m_install | bool
  block:
    - name: Create the user under the alertmanager will run
      user:
        name: "{{ prometheus_alert_m_user }}"
        home: "{{ prometheus_alert_m_home }}"
        createhome: false
        shell: /usr/sbin/nologin
        system: true

    # Root-owned, world-readable: holds the unpacked distribution.
    - name: Create the prometheus alertmanager base directory
      file:
        dest: "{{ item }}"
        state: directory
        owner: root
        group: root
        mode: "0755"
      loop:
        - '{{ prometheus_alert_m_home }}'
        - '{{ prometheus_alert_m_dist_dir }}'

    # Only the data directory is handed over to the service account.
    # The home directory is deliberately not listed here: it is managed as
    # root-owned by the previous task, and managing the same path twice with
    # a different owner and mode would flip it back and forth on every run.
    - name: Create the prometheus alertmanager distribution and data directories
      file:
        dest: "{{ item }}"
        state: directory
        owner: "{{ prometheus_alert_m_user }}"
        group: "{{ prometheus_alert_m_user }}"
        mode: "0700"
        recurse: true
      loop:
        - "{{ prometheus_alert_m_data_dir }}"

    # No "recurse" here: both directories are listed explicitly, and a
    # recursive 0750 would reset the mode of the configuration file (which
    # the configuration block installs as 0640) on every run, causing a
    # spurious change and a service restart each time.
    - name: Create the prometheus alertmanager config directory
      file:
        dest: "{{ item }}"
        state: directory
        owner: root
        group: "{{ prometheus_alert_m_user }}"
        mode: "0750"
      loop:
        - "{{ prometheus_alert_m_conf_dir }}"
        - "{{ prometheus_alert_m_conf_dir }}/templates"

    # A full destination path (instead of the bare directory) lets get_url
    # skip the download when the tarball is already present.
    - name: Download the prometheus alertmanager
      get_url:
        url: "{{ prometheus_alert_m_download_url }}"
        dest: "/srv/{{ prometheus_alert_m_file }}"
        mode: "0644"

    # "creates" makes the extraction a no-op once the binary exists.
    - name: Unarchive the prometheus distribution
      unarchive:
        src: "/srv/{{ prometheus_alert_m_file }}"
        dest: "{{ prometheus_alert_m_dist_dir }}"
        remote_src: true
        owner: root
        group: root
        creates: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager"
      register: alertmanager_download
      notify: Restart alertmanager

    - name: Copy the binaries under /usr/local/bin
      copy:
        src: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/{{ item }}"
        dest: "/usr/local/bin/{{ item }}"
        remote_src: true
        owner: root
        group: root
        mode: "0755"
      loop:
        - alertmanager
        - amtool
|
||||
|
||||
# Installs the systemd unit, reloads systemd when the unit file changed and
# makes sure the service is running and enabled. Skipped when the role is
# asked to remove the alertmanager (prometheus_alert_m_install is false).
- name: Install the Prometheus alertmanager systemd unit
  tags:
    - prometheus
    - prometheus_alertmanager
    - alertmanager_conf
    - prometheus_alertmanager_conf
  when: prometheus_alert_m_install | bool
  block:
    - name: Install the prometheus alertmanager systemd unit
      template:
        src: alertmanager.service.j2
        dest: /etc/systemd/system/alertmanager.service
        mode: "0644"
        owner: root
        group: root
      register: systemd_reload_required
      notify: Restart alertmanager

    # systemd must re-read its unit files before the new unit is used.
    - name: Reload the systemd data
      systemd:
        daemon_reload: true
      when: systemd_reload_required is changed

    - name: Ensure that prometheus prometheus_alertmanager is started and enabled
      service:
        name: alertmanager
        state: started
        enabled: true
|
||||
|
||||
# Renders the alertmanager configuration file. Runs only when the
# alertmanager is installed and prometheus_alert_m_install_conf is true.
- name: Install the Prometheus alertmanager configuration
  tags:
    - prometheus
    - prometheus_alertmanager
    - alertmanager_conf
    - prometheus_alertmanager_conf
  when:
    - prometheus_alert_m_install | bool
    - prometheus_alert_m_install_conf | bool
  block:
    # The destination is prometheus_alert_m_conf_file, i.e. the very path
    # handed to the service through --config.file, so that overriding the
    # variable cannot leave the service reading a different file.
    # No result is registered: a configuration change does not require a
    # systemd daemon-reload, only the handler below.
    - name: Install the prometheus alertmanager configuration file
      template:
        src: alertmanager.yml.j2
        dest: "{{ prometheus_alert_m_conf_file }}"
        mode: "0640"
        owner: root
        group: "{{ prometheus_alert_m_user }}"
      notify: Reload alertmanager
|
||||
|
||||
# Removal path: stop and disable the service, then delete its init
# integration (upstart job or systemd unit). Runs only when
# prometheus_alert_m_install is false.
- name: Remove the alertmanager installation
  tags: ['prometheus', 'prometheus_alertmanager']
  when: not (prometheus_alert_m_install | bool)
  block:
    # Best effort: the service may never have been installed on this host.
    # On systemd the unit is installed as "alertmanager"; the
    # "prometheus_alertmanager" name is kept for the non-systemd (upstart)
    # case handled below.
    - name: Ensure that prometheus prometheus_alertmanager is stopped and disabled
      service:
        name: "{{ 'alertmanager' if ansible_service_mgr == 'systemd' else 'prometheus_alertmanager' }}"
        state: stopped
        enabled: false
      ignore_errors: true

    - name: Remove prometheus alertmanager upstart script
      file:
        dest: /etc/init/prometheus_alertmanager.conf
        state: absent
      when: ansible_service_mgr != 'systemd'

    # The unit file itself is alertmanager.service; the second path is the
    # Alias= name declared in the unit's [Install] section.
    - name: Remove the prometheus alertmanager systemd unit
      file:
        dest: "{{ item }}"
        state: absent
      loop:
        - /etc/systemd/system/alertmanager.service
        - /etc/systemd/system/prometheus_alertmanager.service
      when: ansible_service_mgr == 'systemd'
      register: systemd_reload_required

    - name: Reload the systemd data
      systemd:
        daemon_reload: true
      when: systemd_reload_required is changed
|
||||
|
||||
# Opens (or closes) the alertmanager ports in firewalld. Applies only to
# RedHat-family hosts on which firewalld_enabled is defined and true.
- name: Manage the prometheus alertmanager firewalld rules
  tags:
    - prometheus
    - prometheus_alertmanager
    - firewall
    - firewalld
    - iptables
    - iptables_rules
  when:
    - ansible_distribution_file_variety == "RedHat"
    - firewalld_enabled is defined and firewalld_enabled | bool
  block:
    # One rule per entry of prometheus_alert_m_firewalld_ports; an entry may
    # carry an optional "permanent" key, which defaults to True.
    - name: Manage the prometheus alertmanager firewalld ports
      firewalld:
        port: "{{ item.port }}/{{ item.protocol }}"
        zone: "{{ item.zone }}"
        permanent: "{{ item.permanent | default(True) }}"
        state: "{{ item.state }}"
        immediate: true
      with_items: '{{ prometheus_alert_m_firewalld_ports }}'
|
||||
|
||||
# Local Variables:
|
||||
# eval: (ansible-doc-mode 1)
|
||||
# eval: (add-to-list (quote company-backends) (quote company-ansible))
|
||||
# eval: (ansible 1)
|
||||
# End:
|
||||
|
@ -0,0 +1,15 @@
|
||||
# systemd unit for the Prometheus alertmanager, rendered by the role.
[Unit]
Description=alertmanager - Prometheus alert manager.
# The default listen address is a concrete interface IP, so wait until the
# network is configured rather than merely started.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
Restart=on-failure
User={{ prometheus_alert_m_user }}
Group={{ prometheus_alert_m_user }}

ExecStart={{ prometheus_alert_m_cmd }} {{ prometheus_alert_m_opts }} {{ prometheus_alert_m_additional_opts }}
# Alertmanager re-reads its configuration file on SIGHUP.
ExecReload=/bin/kill -HUP $MAINPID

[Install]
WantedBy=multi-user.target
Alias=prometheus_alertmanager.service
|
@ -0,0 +1,147 @@
|
||||
---
# Alertmanager configuration rendered by the role.
# The child routes and the team-* receivers below are sample entries;
# adapt them to the local notification setup.
global:
  resolve_timeout: 1m
  smtp_from: "{{ prometheus_alert_m_smtp_from }}"
  smtp_smarthost: "{{ prometheus_alert_m_smtp_smarthost }}"
{% if prometheus_alert_m_smtp_authenticated %}
  # Credentials come from prometheus_alert_m_smtp_auth_username and
  # prometheus_alert_m_smtp_auth_password; both render as empty strings
  # when the variables are not defined.
  smtp_auth_username: "{{ prometheus_alert_m_smtp_auth_username | default('') }}"
  smtp_auth_password: "{{ prometheus_alert_m_smtp_auth_password | default('') }}"
{% endif %}

# templates:
#   - "{{ prometheus_alert_m_conf_dir }}/templates/*.tmpl"

# The root route must not have any matchers as it is the entry point for
# all alerts. It needs to have a receiver configured so alerts that do not
# match any of the sub-routes are sent to someone.
route:
  receiver: '{{ prometheus_alert_m_default_receiver }}'

  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: {{ prometheus_alert_m_alerts_group_by }}

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This ensures that multiple alerts for the same group that start
  # firing shortly after one another are batched together in the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.

  # The child route trees.
  routes:
    # This route performs a regular expression match on alert labels to
    # catch alerts that are related to a list of services.
    - match_re:
        service: ^(foo1|foo2|baz)$
      receiver: team-X-mails

      # The service has a sub-route for critical alerts, any alerts
      # that do not match, i.e. severity != critical, fall-back to the
      # parent node and are sent to 'team-X-mails'
      routes:
        - match:
            severity: critical
          receiver: team-X-pager

    - match:
        service: files
      receiver: team-Y-mails

      routes:
        - match:
            severity: critical
          receiver: team-Y-pager

    # This route handles all alerts coming from a database service. If there's
    # no team to handle it, it defaults to the DB team.
    - match:
        service: database

      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]

      routes:
        - match:
            owner: team-X
          receiver: team-X-pager

        - match:
            owner: team-Y
          receiver: team-Y-pager


# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: ['alertname']


receivers:
{% if prometheus_alert_m_default_receiver not in ['team-X-mails', 'team-X-pager', 'team-Y-mails', 'team-Y-pager', 'team-DB-pager'] %}
  # The receiver named by the root route has to be declared. It is emitted
  # without any integration, so notifications routed to it go nowhere until
  # *_configs entries are added.
  - name: '{{ prometheus_alert_m_default_receiver }}'

{% endif %}
  - name: 'team-X-mails'
    email_configs:
      - to: 'team-X+alerts@example.org, team-Y+alerts@example.org'

  - name: 'team-X-pager'
    email_configs:
      - to: 'team-X+alerts-critical@example.org'
    pagerduty_configs:
      - routing_key: <team-X-key>

  - name: 'team-Y-mails'
    email_configs:
      - to: 'team-Y+alerts@example.org'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - routing_key: <team-Y-key>

  - name: 'team-DB-pager'
    pagerduty_configs:
      - routing_key: <team-DB-key>

#  - name: 'mail-slack-receiver'
#    slack_configs:
#      - api_url: put your url here
#        channel: 'put your channel name here'
#        send_resolved: true
#        icon_url: https://avatars3.githubusercontent.com/u/3380462
#        {% raw %}
#        text: >-
#          {{ range .Alerts -}}
#          *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} *Description:* {{ .Annotations.description }}
#          *Details:*
#          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
#          {{ end }}
#          {{ end }}
#        {% endraw %}
#    email_configs:
#      - to: 'emails of the ones that need to be notified'
#        send_resolved: true
|
@ -1,2 +1,12 @@
|
||||
---
# Role variables for the Prometheus Alertmanager role.

# Name of the unpacked release directory and of the tarball it comes from;
# both are derived from prometheus_alert_m_version.
prometheus_alert_m_dir: "alertmanager-{{ prometheus_alert_m_version }}.linux-amd64"
prometheus_alert_m_file: "{{ prometheus_alert_m_dir }}.tar.gz"

# Service account and its home directory.
prometheus_alert_m_user: 'prom_alert_m'
prometheus_alert_m_home: '/opt/prometheus_alertmanager'

# The distribution directory is the home directory itself.
prometheus_alert_m_dist_dir: '{{ prometheus_alert_m_home }}'
# Full path of the alertmanager binary inside the unpacked release.
prometheus_alert_m_cmd: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager"

# Local Variables:
# eval: (ansible-doc-mode 1)
# eval: (add-to-list (quote company-backends) (quote company-ansible))
# eval: (ansible 1)
# End:
|
||||
|
Loading…
Reference in New Issue