diff --git a/README.md b/README.md index 3637db8..6d88055 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,55 @@ Role Name ========= -A brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. +A role that installs the Prometheus alert manager. Role Variables -------------- -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. +The most important variables are listed below: + +``` yaml +prometheus_alert_m_install: true +prometheus_alert_m_version: 0.25.0 +# https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz +prometheus_alert_m_download_url: 'https://github.com/prometheus/alertmanager/releases/download/v{{ prometheus_alert_m_version }}/{{ prometheus_alert_m_file }}' +prometheus_alert_m_port: 9093 +prometheus_alert_m_behind_proxy: false +prometheus_alert_m_ext_url: "https://localhost" +prometheus_alert_m_loglevel: info +prometheus_alert_m_opts: "--log.level={{ prometheus_alert_m_loglevel }} --config.file={{ prometheus_alert_m_conf_file }} --storage.path={{ prometheus_alert_m_data_dir }} {{ prometheus_alert_m_cluster_opts }}" +# List the additional options here +prometheus_alert_m_additional_opts: '' +prometheus_alert_m_conf_dir: "/opt/prometheus/alertmanager/conf" +prometheus_alert_m_conf_file: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml" +prometheus_alert_m_data_dir: "/opt/prometheus/alertmanager/data" +prometheus_alert_m_firewalld_rules: 'enabled'
+prometheus_alert_m_firewalld_ports: + - port: '{{ prometheus_alert_m_port }}' + protocol: 'tcp' + state: '{{ prometheus_alert_m_firewalld_rules }}' + zone: '{{ firewalld_default_zone }}' + +prometheus_alert_m_install_conf: false +prometheus_alert_m_smtp_smarthost: "localhost:25" +prometheus_alert_m_smtp_from: "alerts@localhost" +prometheus_alert_m_smtp_authenticated: false + +prometheus_alert_m_default_receiver: "global-alerts" +prometheus_alert_m_alerts_group_by: "['alertname', 'cluster']" + +prometheus_alert_m_cluster_enabled: false +prometheus_alert_m_cluster_port: 9094 +prometheus_alert_m_cluster_addr: 0.0.0.0 +prometheus_alert_m_cluster_peers: + - "localhost:{{ prometheus_alert_m_cluster_port }}" +prometheus_alert_m_cluster_opts: "{% if not prometheus_alert_m_cluster_enabled %} --cluster.listen-address ''{% else %} --cluster.listen-address '{{ prometheus_alert_m_cluster_addr }}:{{ prometheus_alert_m_cluster_port }}'{% for peer in prometheus_alert_m_cluster_peers %} --cluster.peer {{ peer }}{% endfor %}{% endif %}" +``` Dependencies ------------ -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } +None License ------- @@ -35,4 +59,4 @@ EUPL-1.2 Author Information ------------------ -An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
+Andrea Dell'Amico diff --git a/defaults/main.yml b/defaults/main.yml index 95d3c70..cc13cf5 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -1,2 +1,44 @@ --- -# defaults file for ansible-role-template \ No newline at end of file +prometheus_alert_m_install: true +prometheus_alert_m_version: 0.25.0 +# https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz +prometheus_alert_m_download_url: 'https://github.com/prometheus/alertmanager/releases/download/v{{ prometheus_alert_m_version }}/{{ prometheus_alert_m_file }}' +prometheus_alert_m_listen: '{{ ansible_default_ipv4.address }}' +prometheus_alert_m_port: 9093 +prometheus_alert_m_behind_proxy: false +prometheus_alert_m_ext_url: "https://localhost" +prometheus_alert_m_loglevel: info +prometheus_alert_m_opts: "--web.listen-address={{ prometheus_alert_m_listen }}:{{ prometheus_alert_m_port }} --log.level={{ prometheus_alert_m_loglevel }} --config.file={{ prometheus_alert_m_conf_file }} --storage.path={{ prometheus_alert_m_data_dir }} {{ prometheus_alert_m_cluster_opts }}" +# List the additional options here +prometheus_alert_m_additional_opts: '' +prometheus_alert_m_conf_dir: "/opt/prometheus/alertmanager/conf" +prometheus_alert_m_conf_file: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml" +prometheus_alert_m_data_dir: "/opt/prometheus/alertmanager/data" +prometheus_alert_m_firewalld_rules: 'enabled' +prometheus_alert_m_firewalld_ports: + - port: '{{ prometheus_alert_m_port }}' + protocol: 'tcp' + state: '{{ prometheus_alert_m_firewalld_rules }}' + zone: '{{ firewalld_default_zone }}' + +prometheus_alert_m_install_conf: false +prometheus_alert_m_smtp_smarthost: "localhost:25" +prometheus_alert_m_smtp_from: "alerts@localhost" +prometheus_alert_m_smtp_authenticated: false + +prometheus_alert_m_default_receiver: "global-alerts" +prometheus_alert_m_alerts_group_by: "['alertname', 'cluster']" + +prometheus_alert_m_cluster_enabled: false
+prometheus_alert_m_cluster_port: 9094 +prometheus_alert_m_cluster_addr: 0.0.0.0 +prometheus_alert_m_cluster_advertise: '{{ ansible_default_ipv4.address }}' +prometheus_alert_m_cluster_peers: + - "localhost:{{ prometheus_alert_m_cluster_port }}" +prometheus_alert_m_cluster_opts: "{% if not prometheus_alert_m_cluster_enabled %} --cluster.listen-address=''{% else %} --cluster.listen-address={{ prometheus_alert_m_cluster_addr }}:{{ prometheus_alert_m_cluster_port }} --cluster.advertise-address={{ prometheus_alert_m_cluster_advertise }}:{{ prometheus_alert_m_cluster_port }}{% for peer in prometheus_alert_m_cluster_peers %} --cluster.peer={{ peer }}{% endfor %}{% endif %}" + +# Local Variables: +# eval: (ansible-doc-mode 1) +# eval: (add-to-list (quote company-backends) (quote company-ansible)) +# eval: (ansible 1) +# End: diff --git a/handlers/main.yml b/handlers/main.yml index 27474e0..f928b1a 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -1,2 +1,13 @@ --- -# handlers file for ansible-role-template \ No newline at end of file +- name: Restart alertmanager + service: + name: alertmanager + state: restarted + enabled: true + +- name: Reload alertmanager + service: + name: alertmanager + # state: reloaded + state: restarted + enabled: true diff --git a/meta/main.yml b/meta/main.yml index 81bda14..5565670 100644 --- a/meta/main.yml +++ b/meta/main.yml @@ -1,46 +1,29 @@ galaxy_info: - author: your name - description: your description - company: your company (optional) + author: Andrea Dell'Amico + description: A role that installs the Prometheus alert manager + company: ISTI-CNR - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - issue_tracker_url: https://support.d4science.org/projects/automatic-provisioning/issues + issue_tracker_url: https://redmine-s2i2s.isti.cnr.it/projects/provisioning - license: EUPL-1.2 + license: EUPL 1.2+ min_ansible_version: 2.8 - # If this a Container Enabled role, provide the minimum Ansible Container version.
- # min_ansible_container_version: - - # Optionally specify the branch Galaxy will use when accessing the GitHub - # repo for this role. During role install, if no tags are available, - # Galaxy will use this branch. During import Galaxy will access files on - # this branch. If Travis integration is configured, only notifications for this - # branch will be accepted. Otherwise, in all cases, the repo's default branch - # (usually master) will be used. - #github_branch: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. # To view available platforms and versions (or releases), visit: # https://galaxy.ansible.com/api/v1/platforms/ # platforms: - - name: Ubuntu - versions: - - bionic + - name: Ubuntu + versions: + - trusty + - bionic + - name: EL + versions: + - 7 + - 8 - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. + galaxy_tags: + - monitoring + - metrics dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. 
\ No newline at end of file diff --git a/tasks/main.yml b/tasks/main.yml index 53c6cae..14a0303 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1,2 +1,157 @@ --- -# tasks file for ansible-role-template \ No newline at end of file +- name: Install the Prometheus alertmanager + tags: ['prometheus', 'prometheus_alertmanager'] + block: + - name: Create the user under the alertmanager will run + user: + name: "{{ prometheus_alert_m_user }}" + home: "{{ prometheus_alert_m_home }}" + createhome: false + shell: /usr/sbin/nologin + system: true + + - name: Create the prometheus alertmanager base directory + file: + dest: "{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - '{{ prometheus_alert_m_home }}' + - '{{ prometheus_alert_m_dist_dir }}' + + - name: Create the prometheus alertmanager distribution and data directories + file: + dest: "{{ item }}" + state: directory + owner: "{{ prometheus_alert_m_user }}" + group: "{{ prometheus_alert_m_user }}" + mode: 0700 + recurse: true + loop: + - "{{ prometheus_alert_m_home }}" + - "{{ prometheus_alert_m_data_dir }}" + + - name: Create the prometheus alertmanager config directory + file: + dest: "{{ item }}" + state: directory + owner: root + group: "{{ prometheus_alert_m_user }}" + mode: 0750 + recurse: true + loop: + - "{{ prometheus_alert_m_conf_dir }}" + - "{{ prometheus_alert_m_conf_dir }}/templates" + + - name: Download the prometheus alertmanager + get_url: + url: "{{ prometheus_alert_m_download_url }}" + dest: /srv/ + + - name: Unarchive the prometheus distribution + unarchive: + src: "/srv/{{ prometheus_alert_m_file }}" + dest: "{{ prometheus_alert_m_dist_dir }}" + remote_src: true + owner: root + group: root + args: + creates: "{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager" + register: alertmanager_download + notify: Restart alertmanager + + - name: Copy the binaries under /usr/local/bin + copy: + src: "{{ prometheus_alert_m_dist_dir }}/{{ 
prometheus_alert_m_dir }}/{{ item }}" + dest: "/usr/local/bin/{{ item }}" + remote_src: true + owner: root + group: root + mode: 0755 + loop: + - alertmanager + - amtool + +- name: Install the Prometheus alertmanager systemd unit + tags: + - prometheus + - prometheus_alertmanager + - alertmanager_conf + - prometheus_alertmanager_conf + block: + - name: Install the prometheus alertmanager systemd unit + template: + src: alertmanager.service.j2 + dest: /etc/systemd/system/alertmanager.service + mode: 0644 + owner: root + group: root + register: systemd_reload_required + notify: Restart alertmanager + + - name: Reload the systemd data + systemd: daemon_reload=yes + when: systemd_reload_required is changed + + - name: Ensure that prometheus prometheus_alertmanager is started and enabled + service: + name: alertmanager + state: started + enabled: true + +- name: Install the Prometheus alertmanager configuration + tags: + - prometheus + - prometheus_alertmanager + - alertmanager_conf + - prometheus_alertmanager_conf + when: prometheus_alert_m_install_conf + block: + - name: Install the prometheus alertmanager configuration file + template: + src: alertmanager.yml.j2 + dest: "{{ prometheus_alert_m_conf_dir }}/alertmanager.yml" + mode: 0640 + owner: root + group: "{{ prometheus_alert_m_user }}" + register: systemd_reload_required + notify: Reload alertmanager + +- name: Remove the alertmanager installation + tags: [ 'prometheus', 'prometheus_alertmanager' ] + when: not prometheus_alert_m_install + block: + - name: Ensure that prometheus prometheus_alertmanager is stopped and disabled + service: name=alertmanager state=stopped enabled=no + ignore_errors: True + + - name: Remove prometheus alertmanager upstart script + file: dest=/etc/init/prometheus_alertmanager.conf state=absent + when: ansible_service_mgr != 'systemd' + + - name: Remove the prometheus alertmanager systemd unit + file: dest=/etc/systemd/system/alertmanager.service state=absent + when:
ansible_service_mgr == 'systemd' + register: systemd_reload_required + + - name: Reload the systemd data + systemd: daemon_reload=yes + when: systemd_reload_required is changed + +- name: Manage the prometheus alertmanager firewalld rules + when: + - ansible_distribution_file_variety == "RedHat" + - firewalld_enabled is defined and firewalld_enabled | bool + tags: [ 'prometheus', 'prometheus_alertmanager', 'firewall', 'firewalld', 'iptables', 'iptables_rules' ] + block: + - name: Manage the prometheus alertmanager firewalld ports + firewalld: port={{ item.port }}/{{ item.protocol }} zone={{ item.zone }} permanent={{ item.permanent | default(True) }} state={{ item.state }} immediate=True + with_items: '{{ prometheus_alert_m_firewalld_ports }}' + +# Local Variables: +# eval: (ansible-doc-mode 1) +# eval: (add-to-list (quote company-backends) (quote company-ansible)) +# eval: (ansible 1) +# End: diff --git a/templates/alertmanager.service.j2 b/templates/alertmanager.service.j2 new file mode 100644 index 0000000..0147a13 --- /dev/null +++ b/templates/alertmanager.service.j2 @@ -0,0 +1,15 @@ +[Unit] +Description=alertmanager - Prometheus alert manager. 
+After=network.target + +[Service] +Type=simple +Restart=on-failure +User={{ prometheus_alert_m_user }} +Group={{ prometheus_alert_m_user }} + +ExecStart={{ prometheus_alert_m_cmd }} {{ prometheus_alert_m_opts }} {{ prometheus_alert_m_additional_opts }} + +[Install] +WantedBy=multi-user.target +Alias=prometheus_alertmanager.service diff --git a/templates/alertmanager.yml.j2 b/templates/alertmanager.yml.j2 new file mode 100644 index 0000000..9e4cdb9 --- /dev/null +++ b/templates/alertmanager.yml.j2 @@ -0,0 +1,149 @@ +--- +global: + resolve_timeout: 1m + smtp_from: {{ prometheus_alert_m_smtp_from }} + smtp_smarthost: {{ prometheus_alert_m_smtp_smarthost }} +{% if prometheus_alert_m_smtp_authenticated %} + smtp_auth_username: + smtp_auth_password: +{% endif %} +# templates: +# - "{{ prometheus_alert_m_conf_dir }}/templates/*.tmpl" +# The root route on which each incoming alert enters. +route: + # The root route must not have any matchers as it is the entry point for + # all alerts. It needs to have a receiver configured so alerts that do not + # match any of the sub-routes are sent to someone. + receiver: '{{ prometheus_alert_m_default_receiver }}' + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: {{ prometheus_alert_m_alerts_group_by }} + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification.
+ group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # All the above attributes are inherited by all child routes and can + # be overwritten on each. + + # The child route trees. + routes: + # This route performs a regular expression match on alert labels to + # catch alerts that are related to a list of services. + - match_re: + service: ^(foo1|foo2|baz)$ + receiver: team-X-mails + + # The service has a sub-route for critical alerts, any alerts + # that do not match, i.e. severity != critical, fall-back to the + # parent node and are sent to 'team-X-mails' + routes: + - match: + severity: critical + receiver: team-X-pager + + - match: + service: files + receiver: team-Y-mails + + routes: + - match: + severity: critical + receiver: team-Y-pager + + # This route handles all alerts coming from a database service. If there's + # no team to handle it, it defaults to the DB team. + - match: + service: database + + receiver: team-DB-pager + # Also group alerts by affected database. + group_by: [alertname, cluster, database] + + routes: + - match: + owner: team-X + receiver: team-X-pager + + - match: + owner: team-Y + receiver: team-Y-pager + + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: +- source_matchers: + - severity="critical" + target_matchers: + - severity="warning" + # Apply inhibition if the alertname is the same. + # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply!
+ equal: ['alertname'] + + +receivers: +- name: 'team-X-mails' + email_configs: + - to: 'team-X+alerts@example.org, team-Y+alerts@example.org' + +- name: 'team-X-pager' + email_configs: + - to: 'team-X+alerts-critical@example.org' + pagerduty_configs: + - routing_key: + +- name: 'team-Y-mails' + email_configs: + - to: 'team-Y+alerts@example.org' + +- name: 'team-Y-pager' + pagerduty_configs: + - routing_key: + +- name: 'team-DB-pager' + pagerduty_configs: + - routing_key: + +# - name: 'mail-slack-receiver' +# slack_configs: +# - api_url: put your url here +# channel: 'put your channel name here' +# send_resolved: true +# icon_url: https://avatars3.githubusercontent.com/u/3380462 +# {% raw %} +# text: >- +# {{ range .Alerts -}} +# *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} *Description:* {{ .Annotations.description }} +# *Details:* +# {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` +# {{ end }} +# {{ end }} +# {% endraw %} +# email_configs: +# - to: 'emails of the ones that need to be notified' +# send_resolved: true diff --git a/vars/main.yml b/vars/main.yml index 3808477..3e7ba59 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -1,2 +1,12 @@ --- -# vars file for ansible-role-template \ No newline at end of file +prometheus_alert_m_dir: 'alertmanager-{{ prometheus_alert_m_version }}.linux-amd64' +prometheus_alert_m_file: '{{ prometheus_alert_m_dir }}.tar.gz' +prometheus_alert_m_user: prom_alert_m +prometheus_alert_m_home: /opt/prometheus_alertmanager +prometheus_alert_m_dist_dir: "{{ prometheus_alert_m_home }}" +prometheus_alert_m_cmd: '{{ prometheus_alert_m_dist_dir }}/{{ prometheus_alert_m_dir }}/alertmanager' +# Local Variables: +# eval: (ansible-doc-mode 1) +# eval: (add-to-list (quote company-backends) (quote company-ansible)) +# eval: (ansible 1) +# End: