Basic role that installs prometheus alertmanager.

This commit is contained in:
Andrea Dell'Amico 2023-01-10 20:40:49 +01:00
parent 9c44f5e486
commit c21c6c6c9a
Signed by: andrea.dellamico
GPG Key ID: 147ABE6CEB9E20FF
8 changed files with 442 additions and 55 deletions

View File

@ -1,31 +1,55 @@
Role Name
=========
A brief description of the role goes here.
Requirements
------------
Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required.
A role that installs the Prometheus alert manager, <https://github.com/prometheus/alertmanager>
Role Variables
--------------
A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well.
The most important variables are listed below:
``` yaml
prometheus_alert_m_install: true
prometheus_alert_m_version: 0.25.0
# https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
prometheus_alert_m_download_url: 'https://github.com/prometheus/alertmanager/releases/download/v{{ prometheus_alert_m_version }}/{{ prometheus_alert_m_file }}'
prometheus_alert_m_listen: '{{ ansible_default_ipv4.address }}'
prometheus_alert_m_port: 9093
prometheus_alert_m_behind_proxy: false
prometheus_alert_m_ext_url: "https://localhost"
prometheus_alert_m_loglevel: info
prometheus_alert_m_opts: "--web.listen-address={{ prometheus_alert_m_listen }}:{{ prometheus_alert_m_port }} --log.level={{ prometheus_alert_m_loglevel }} --config.file={{ prometheus_alert_m_conf_file }} --storage.path={{ prometheus_alert_m_data_dir }} {{ prometheus_alert_m_cluster_opts }}"
# List the additional options here
prometheus_alert_m_additional_opts: ''
prometheus_alert_m_conf_dir: "/opt/prometheus/alertmanager/conf"
prometheus_alert_m_conf_file: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml"
prometheus_alert_m_data_dir: "/opt/prometheus/alertmanager/data"
prometheus_alert_m_firewalld_rules: 'enabled'
prometheus_alert_m_firewalld_ports:
  - port: '{{ prometheus_alert_m_port }}'
    protocol: 'tcp'
    state: '{{ prometheus_alert_m_firewalld_rules }}'
    zone: '{{ firewalld_default_zone }}'
prometheus_alert_m_install_conf: false
prometheus_alert_m_smtp_smarthost: "localhost:25"
prometheus_alert_m_smtp_from: "alerts@localhost"
prometheus_alert_m_smtp_authenticated: false
prometheus_alert_m_default_receiver: "global-alerts"
prometheus_alert_m_alerts_group_by: "['alertname', 'cluster']"
prometheus_alert_m_cluster_enabled: false
prometheus_alert_m_cluster_port: 9094
prometheus_alert_m_cluster_addr: 0.0.0.0
prometheus_alert_m_cluster_advertise: '{{ ansible_default_ipv4.address }}'
prometheus_alert_m_cluster_peers:
  - "localhost:{{ prometheus_alert_m_cluster_port }}"
prometheus_alert_m_cluster_opts: "{% if not prometheus_alert_m_cluster_enabled %} --cluster.listen-address=''{% else %} --cluster.listen-address={{ prometheus_alert_m_cluster_addr }}:{{ prometheus_alert_m_cluster_port }} --cluster.advertise-address={{ prometheus_alert_m_cluster_advertise}}:{{prometheus_alert_m_cluster_port }}{% for peer in prometheus_alert_m_cluster_peers %} --cluster.peer={{ peer }}{% endfor %}{% endif %}"
```
Dependencies
------------
A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles.
Example Playbook
----------------
Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too:
- hosts: servers
roles:
- { role: username.rolename, x: 42 }
None
License
-------
@ -35,4 +59,4 @@ EUPL-1.2
Author Information
------------------
An optional section for the role authors to include contact information, or a website (HTML is not allowed).
Andrea Dell'Amico, <andrea.dellamico@isti.cnr.it>

View File

@ -1,2 +1,44 @@
---
# Default variables of the Prometheus alertmanager role.

# --- Installation ------------------------------------------------------------
# When false, the removal tasks of the role are executed.
prometheus_alert_m_install: true
prometheus_alert_m_version: "0.25.0"
# Example of a resulting download URL:
# https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
# (prometheus_alert_m_file is defined in the role vars file.)
prometheus_alert_m_download_url: "https://github.com/prometheus/alertmanager/releases/download/v{{ prometheus_alert_m_version }}/{{ prometheus_alert_m_file }}"

# --- Web listener and command line -------------------------------------------
prometheus_alert_m_listen: "{{ ansible_default_ipv4.address }}"
prometheus_alert_m_port: 9093
# NOTE(review): the next two variables are not referenced by any task or
# template visible in this change - confirm where they are consumed.
prometheus_alert_m_behind_proxy: false
prometheus_alert_m_ext_url: "https://localhost"
prometheus_alert_m_loglevel: info
# Options passed to the alertmanager binary by the systemd unit. The folded
# scalar below yields a single line with the options separated by one space.
prometheus_alert_m_opts: >-
  --web.listen-address={{ prometheus_alert_m_listen }}:{{ prometheus_alert_m_port }}
  --log.level={{ prometheus_alert_m_loglevel }}
  --config.file={{ prometheus_alert_m_conf_file }}
  --storage.path={{ prometheus_alert_m_data_dir }}
  {{ prometheus_alert_m_cluster_opts }}
# List the additional options here; they are appended after
# prometheus_alert_m_opts on the command line.
prometheus_alert_m_additional_opts: ""

# --- Paths -------------------------------------------------------------------
prometheus_alert_m_conf_dir: "/opt/prometheus/alertmanager/conf"
prometheus_alert_m_conf_file: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml"
prometheus_alert_m_data_dir: "/opt/prometheus/alertmanager/data"

# --- Firewall (firewalld) ----------------------------------------------------
prometheus_alert_m_firewalld_rules: "enabled"
prometheus_alert_m_firewalld_ports:
  - port: "{{ prometheus_alert_m_port }}"
    protocol: "tcp"
    state: "{{ prometheus_alert_m_firewalld_rules }}"
    # firewalld_default_zone is not defined by this role.
    zone: "{{ firewalld_default_zone }}"

# --- alertmanager.yml --------------------------------------------------------
# The configuration file is only installed when this is true.
prometheus_alert_m_install_conf: false
prometheus_alert_m_smtp_smarthost: "localhost:25"
prometheus_alert_m_smtp_from: "alerts@localhost"
prometheus_alert_m_smtp_authenticated: false
prometheus_alert_m_default_receiver: "global-alerts"
# Inserted verbatim as the value of group_by, hence the list written as a string.
prometheus_alert_m_alerts_group_by: "['alertname', 'cluster']"

# --- Clustering --------------------------------------------------------------
prometheus_alert_m_cluster_enabled: false
prometheus_alert_m_cluster_port: 9094
prometheus_alert_m_cluster_addr: "0.0.0.0"
prometheus_alert_m_cluster_advertise: "{{ ansible_default_ipv4.address }}"
prometheus_alert_m_cluster_peers:
  - "localhost:{{ prometheus_alert_m_cluster_port }}"
# With clustering disabled an empty cluster listen address is passed; otherwise
# the listen/advertise addresses and one --cluster.peer option per peer.
prometheus_alert_m_cluster_opts: "{% if not prometheus_alert_m_cluster_enabled %} --cluster.listen-address=''{% else %} --cluster.listen-address={{ prometheus_alert_m_cluster_addr }}:{{ prometheus_alert_m_cluster_port }} --cluster.advertise-address={{ prometheus_alert_m_cluster_advertise}}:{{prometheus_alert_m_cluster_port }}{% for peer in prometheus_alert_m_cluster_peers %} --cluster.peer={{ peer }}{% endfor %}{% endif %}"
# Local Variables:
# eval: (ansible-doc-mode 1)
# eval: (add-to-list (quote company-backends) (quote company-ansible))
# eval: (ansible 1)
# End:

View File

@ -1,2 +1,13 @@
---
# Handlers of the Prometheus alertmanager role.

# Restarts the service and makes sure it is enabled at boot.
- name: Restart alertmanager
  service:
    name: alertmanager
    enabled: true
    state: restarted

# The "reloaded" state is left commented out, so this handler currently
# performs a full restart, exactly like the handler above.
- name: Reload alertmanager
  service:
    name: alertmanager
    enabled: true
    # state: reloaded
    state: restarted

View File

@ -1,46 +1,29 @@
# Ansible Galaxy metadata of the role.
galaxy_info:
author: your name
description: your description
company: your company (optional)
author: Andrea Dell'Amico
# NOTE(review): this value reads like a job title rather than a description
# of what the role does - confirm it is intended.
description: Systems Architect
company: ISTI-CNR
# If the issue tracker for your role is not on github, uncomment the
# next line and provide a value
issue_tracker_url: https://support.d4science.org/projects/automatic-provisioning/issues
issue_tracker_url: https://redmine-s2i2s.isti.cnr.it/projects/provisioning
license: EUPL-1.2
license: EUPL 1.2+
# NOTE(review): an unquoted 2.8 is parsed by YAML as a float; consider
# quoting it ('2.8') so that it stays a version string.
min_ansible_version: 2.8
# If this a Container Enabled role, provide the minimum Ansible Container version.
# min_ansible_container_version:
# Optionally specify the branch Galaxy will use when accessing the GitHub
# repo for this role. During role install, if no tags are available,
# Galaxy will use this branch. During import Galaxy will access files on
# this branch. If Travis integration is configured, only notifications for this
# branch will be accepted. Otherwise, in all cases, the repo's default branch
# (usually master) will be used.
#github_branch:
#
# Provide a list of supported platforms, and for each platform a list of versions.
# If you don't wish to enumerate all versions for a particular platform, use 'all'.
# To view available platforms and versions (or releases), visit:
# https://galaxy.ansible.com/api/v1/platforms/
#
platforms:
- name: Ubuntu
versions:
- bionic
- name: Ubuntu
versions:
- trusty
- bionic
- name: EL
versions:
- 7
- 8
galaxy_tags: []
# List tags for your role here, one per line. A tag is a keyword that describes
# and categorizes the role. Users find roles by searching for tags. Be sure to
# remove the '[]' above, if you add tags to this list.
#
# NOTE: A tag is limited to a single word comprised of alphanumeric characters.
# Maximum 20 tags per role.
galaxy_tags:
- monitoring
- metrics
# The role does not depend on other roles.
dependencies: []
# List your role dependencies here, one per line. Be sure to remove the '[]' above,
# if you add dependencies to this list.

View File

@ -1,2 +1,157 @@
---
# Tasks of the Prometheus alertmanager role.
#
# Flow:
#   * prometheus_alert_m_install is true: create the service user and the
#     directories, download and unpack the distribution, install the systemd
#     unit and, when prometheus_alert_m_install_conf is true, the
#     configuration file.
#   * prometheus_alert_m_install is false: stop the service and remove its
#     unit.
#   * The firewalld ports are managed on RedHat-like hosts when
#     firewalld_enabled is defined and true.

- name: Install the Prometheus alertmanager
  tags: ['prometheus', 'prometheus_alertmanager']
  # Without this guard a removal run would first install everything again and
  # the notified handlers would restart the service after the removal.
  when: prometheus_alert_m_install | bool
  block:
    - name: Create the user under the alertmanager will run
      user:
        name: "{{ prometheus_alert_m_user }}"
        home: "{{ prometheus_alert_m_home }}"
        createhome: false
        shell: /usr/sbin/nologin
        system: true

    - name: Create the prometheus alertmanager base directory
      file:
        dest: "{{ item }}"
        state: directory
        owner: root
        group: root
        mode: "0755"
      loop:
        - "{{ prometheus_alert_m_home }}"
        - "{{ prometheus_alert_m_dist_dir }}"

    # Only the data directory is handed over to the service user. The home
    # directory is already managed by the previous task (root:root, 0755);
    # managing it here again with a different owner and mode made both tasks
    # report "changed" on every run. The recursion is dropped as well, so the
    # files written by alertmanager are not re-chmod'ed at each run.
    - name: Create the prometheus alertmanager data directory
      file:
        dest: "{{ prometheus_alert_m_data_dir }}"
        state: directory
        owner: "{{ prometheus_alert_m_user }}"
        group: "{{ prometheus_alert_m_user }}"
        mode: "0700"

    # No recursion here: it would reset alertmanager.yml to 0750, the template
    # task would then set it back to 0640 and notify the handler, restarting
    # the service on every run.
    - name: Create the prometheus alertmanager config directory
      file:
        dest: "{{ item }}"
        state: directory
        owner: root
        group: "{{ prometheus_alert_m_user }}"
        mode: "0750"
      loop:
        - "{{ prometheus_alert_m_conf_dir }}"
        - "{{ prometheus_alert_m_conf_dir }}/templates"

    # A full file path as destination lets get_url skip the download when the
    # archive is already present; with a directory it downloads every time.
    - name: Download the prometheus alertmanager
      get_url:
        url: "{{ prometheus_alert_m_download_url }}"
        dest: "/srv/{{ prometheus_alert_m_file }}"
        mode: "0644"

    - name: Unarchive the prometheus distribution
      unarchive:
        src: "/srv/{{ prometheus_alert_m_file }}"
        dest: "{{ prometheus_alert_m_dist_dir }}"
        remote_src: true
        owner: root
        group: root
        # Skip the extraction when the binary is already in place.
        creates: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager"
      register: alertmanager_download
      notify: Restart alertmanager

    - name: Copy the binaries under /usr/local/bin
      copy:
        src: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/{{ item }}"
        dest: "/usr/local/bin/{{ item }}"
        remote_src: true
        owner: root
        group: root
        mode: "0755"
      loop:
        - alertmanager
        - amtool

- name: Install the Prometheus alertmanager systemd unit
  tags:
    - prometheus
    - prometheus_alertmanager
    - alertmanager_conf
    - prometheus_alertmanager_conf
  when: prometheus_alert_m_install | bool
  block:
    - name: Install the prometheus alertmanager systemd unit
      template:
        src: alertmanager.service.j2
        dest: /etc/systemd/system/alertmanager.service
        mode: "0644"
        owner: root
        group: root
      register: systemd_reload_required
      notify: Restart alertmanager

    # systemd has to re-read the unit before the service is started.
    - name: Reload the systemd data
      systemd:
        daemon_reload: true
      when: systemd_reload_required is changed

    - name: Ensure that prometheus prometheus_alertmanager is started and enabled
      service:
        name: alertmanager
        state: started
        enabled: true

- name: Install the Prometheus alertmanager configuration
  tags:
    - prometheus
    - prometheus_alertmanager
    - alertmanager_conf
    - prometheus_alertmanager_conf
  when:
    - prometheus_alert_m_install | bool
    - prometheus_alert_m_install_conf | bool
  block:
    # A configuration change does not need a systemd daemon-reload, so the
    # result is not registered as systemd_reload_required any more.
    - name: Install the prometheus alertmanager configuration file
      template:
        src: alertmanager.yml.j2
        dest: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml"
        mode: "0640"
        owner: root
        group: "{{ prometheus_alert_m_user }}"
      notify: Reload alertmanager

- name: Remove the alertnamager installation
  tags: ['prometheus', 'prometheus_alertmanager']
  when: not (prometheus_alert_m_install | bool)
  block:
    # The unit is installed as alertmanager.service;
    # prometheus_alertmanager.service is only its alias. Errors are ignored on
    # purpose: the service may not exist on this host.
    - name: Ensure that prometheus prometheus_alertmanager is stopped and disabled
      service:
        name: alertmanager
        state: stopped
        enabled: false
      ignore_errors: true

    - name: Remove prometheus alertmanager upstart script
      file:
        dest: /etc/init/prometheus_alertmanager.conf
        state: absent
      when: ansible_service_mgr != 'systemd'

    # Remove the real unit file and the alias name that was the only path
    # removed before.
    - name: Remove the prometheus alertmanager systemd unit
      file:
        dest: "{{ item }}"
        state: absent
      loop:
        - /etc/systemd/system/alertmanager.service
        - /etc/systemd/system/prometheus_alertmanager.service
      when: ansible_service_mgr == 'systemd'
      register: systemd_reload_required

    - name: Reload the systemd data
      systemd:
        daemon_reload: true
      when: systemd_reload_required is changed

- name: Manage the prometheus alertmanager firewalld rules
  when:
    - ansible_distribution_file_variety == "RedHat"
    - firewalld_enabled is defined and firewalld_enabled | bool
  tags: ['prometheus', 'prometheus_alertmanager', 'firewall', 'firewalld', 'iptables', 'iptables_rules']
  block:
    # Each item provides port, protocol, zone and state; "permanent" is
    # optional and defaults to true.
    - name: Manage the prometheus alertmanager firewalld ports
      firewalld:
        port: "{{ item.port }}/{{ item.protocol }}"
        zone: "{{ item.zone }}"
        permanent: "{{ item.permanent | default(true) }}"
        state: "{{ item.state }}"
        immediate: true
      loop: "{{ prometheus_alert_m_firewalld_ports }}"
# Local Variables:
# eval: (ansible-doc-mode 1)
# eval: (add-to-list (quote company-backends) (quote company-ansible))
# eval: (ansible 1)
# End:

View File

@ -0,0 +1,15 @@
# systemd unit of the Prometheus alertmanager, rendered by Ansible.
[Unit]
Description=alertmanager - Prometheus alert manager.
# The command line binds the web listener to a specific address, so the unit
# is started only once the network is configured.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
Restart=on-failure
User={{ prometheus_alert_m_user }}
Group={{ prometheus_alert_m_user }}
ExecStart={{ prometheus_alert_m_cmd }} {{ prometheus_alert_m_opts }} {{ prometheus_alert_m_additional_opts }}
# Alertmanager re-reads its configuration when it receives SIGHUP.
ExecReload=/bin/kill -HUP $MAINPID

[Install]
WantedBy=multi-user.target
# Alternative unit name for the same service.
Alias=prometheus_alertmanager.service

View File

@ -0,0 +1,147 @@
---
# Alertmanager configuration, rendered by Ansible.
# Jinja statements are kept at column 0 on purpose, so that they do not leave
# stray indentation in the rendered file.
global:
  resolve_timeout: 1m
  # to_json emits a double-quoted scalar, so special characters in the values
  # cannot break the YAML document.
  smtp_from: {{ prometheus_alert_m_smtp_from | to_json }}
  smtp_smarthost: {{ prometheus_alert_m_smtp_smarthost | to_json }}
{% if prometheus_alert_m_smtp_authenticated %}
  # The credentials come from two optional variables; both render as an empty
  # string when they are not defined.
  smtp_auth_username: {{ prometheus_alert_m_smtp_auth_username | default('') | string | to_json }}
  smtp_auth_password: {{ prometheus_alert_m_smtp_auth_password | default('') | string | to_json }}
{% endif %}

# templates:
#   - "{{ prometheus_alert_m_conf_dir }}/templates/*.tmpl"

# The routing settings must live under the top level "route" key.
route:
  # The root route must not have any matchers as it is the entry point for
  # all alerts. It needs to have a receiver configured so alerts that do not
  # match any of the sub-routes are sent to someone.
  receiver: '{{ prometheus_alert_m_default_receiver }}'

  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: {{ prometheus_alert_m_alerts_group_by }}

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # All the above attributes are inherited by all child routes and can
  # overwritten on each.

  # The child route trees.
  routes:
    # This routes performs a regular expression match on alert labels to
    # catch alerts that are related to a list of services.
    - match_re:
        service: ^(foo1|foo2|baz)$
      receiver: team-X-mails
      # The service has a sub-route for critical alerts, any alerts
      # that do not match, i.e. severity != critical, fall-back to the
      # parent node and are sent to 'team-X-mails'
      routes:
        - match:
            severity: critical
          receiver: team-X-pager
    - match:
        service: files
      receiver: team-Y-mails
      routes:
        - match:
            severity: critical
          receiver: team-Y-pager
    # This route handles all alerts coming from a database service. If there's
    # no team to handle it, it defaults to the DB team.
    - match:
        service: database
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - match:
            owner: team-X
          receiver: team-X-pager
        - match:
            owner: team-Y
          receiver: team-Y-pager

# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: ['alertname']

receivers:
{% if prometheus_alert_m_default_receiver not in ['team-X-mails', 'team-X-pager', 'team-Y-mails', 'team-Y-pager', 'team-DB-pager'] %}
  # Receiver referenced by the root route. It has no notification integration
  # configured, so the alerts routed here are not delivered anywhere until
  # email_configs, slack_configs, ... are added.
  - name: '{{ prometheus_alert_m_default_receiver }}'
{% endif %}
  - name: 'team-X-mails'
    email_configs:
      - to: 'team-X+alerts@example.org, team-Y+alerts@example.org'
  - name: 'team-X-pager'
    email_configs:
      - to: 'team-X+alerts-critical@example.org'
    pagerduty_configs:
      - routing_key: <team-X-key>
  - name: 'team-Y-mails'
    email_configs:
      - to: 'team-Y+alerts@example.org'
  - name: 'team-Y-pager'
    pagerduty_configs:
      - routing_key: <team-Y-key>
  - name: 'team-DB-pager'
    pagerduty_configs:
      - routing_key: <team-DB-key>
#  - name: 'mail-slack-receiver'
#    slack_configs:
#      - api_url: put your url here
#        channel: 'put your channel name here'
#        send_resolved: true
#        icon_url: https://avatars3.githubusercontent.com/u/3380462
# {% raw %}
#        text: >-
#          {{ range .Alerts -}}
#          *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} *Description:* {{ .Annotations.description }}
#          *Details:*
#          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
#          {{ end }}
#          {{ end }}
# {% endraw %}
#    email_configs:
#      - to: 'emails of the ones that need to be notified'
#        send_resolved: true

View File

@ -1,2 +1,12 @@
---
# Internal variables of the Prometheus alertmanager role.

# Name of the directory contained in the release archive, and of the archive
# itself.
prometheus_alert_m_dir: "alertmanager-{{ prometheus_alert_m_version }}.linux-amd64"
prometheus_alert_m_file: "{{ prometheus_alert_m_dir }}.tar.gz"

# System user that runs the service, and its home directory.
prometheus_alert_m_user: "prom_alert_m"
prometheus_alert_m_home: "/opt/prometheus_alertmanager"

# The distribution is unpacked directly under the home directory.
prometheus_alert_m_dist_dir: "{{ prometheus_alert_m_home }}"

# Full path of the binary started by the systemd unit.
prometheus_alert_m_cmd: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager"
# Local Variables:
# eval: (ansible-doc-mode 1)
# eval: (add-to-list (quote company-backends) (quote company-ansible))
# eval: (ansible 1)
# End: