From 5cf8473575f3c97d2a2e160e20171e4d8655b872 Mon Sep 17 00:00:00 2001
From: Alex-Welsh
Date: Fri, 24 Nov 2023 09:55:11 +0000
Subject: [PATCH] Add alertmanager rules for Docker

Docker has a builtin prometheus exporter that we currently don't have
enabled. This change adds alerts for stopped/paused containers and
failed healthchecks.

This patch requires changes to docker's configuration to export the
metrics, and prometheus to consume them. This means that Kayobe and
Kolla-Ansible should both be updated to their latest versions.
---
 etc/kayobe/inventory/group_vars/all/docker    |  3 ++
 .../kolla/config/prometheus/docker.rules      | 33 +++++++++++++++++++
 .../notes/docker-alerts-30e8d870f25e500b.yaml |  4 +++
 3 files changed, 40 insertions(+)
 create mode 100644 etc/kayobe/inventory/group_vars/all/docker
 create mode 100644 etc/kayobe/kolla/config/prometheus/docker.rules
 create mode 100644 releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml

diff --git a/etc/kayobe/inventory/group_vars/all/docker b/etc/kayobe/inventory/group_vars/all/docker
new file mode 100644
index 0000000000..bc807e8527
--- /dev/null
+++ b/etc/kayobe/inventory/group_vars/all/docker
@@ -0,0 +1,3 @@
+---
+# Address for prometheus metrics endpoint
+docker_metrics_addr: "{{ internal_net_name | net_ip + ':9323'}}"
diff --git a/etc/kayobe/kolla/config/prometheus/docker.rules b/etc/kayobe/kolla/config/prometheus/docker.rules
new file mode 100644
index 0000000000..0d197e8713
--- /dev/null
+++ b/etc/kayobe/kolla/config/prometheus/docker.rules
@@ -0,0 +1,33 @@
+
+{% raw %}
+
+groups:
+- name: Docker
+  rules:
+
+  - alert: DockerContainerStopped
+    expr: 'engine_daemon_container_states_containers{state="stopped"} > 0'
+    labels:
+      severity: warning
+    annotations:
+      summary: "Containers not running (instance {{ $labels.instance }})"
+      description: "One or more container are stopped"
+
+  - alert: DockerContainerPaused
+    expr: 'engine_daemon_container_states_containers{state="paused"} > 0'
+    labels:
+      severity: warning
+    annotations:
+      summary: "Containers paused (instance {{ $labels.instance }})"
+      description: "One or more containers are paused"
+
+  - alert: DockerContainerHealthCheckFail
+    expr: rate(engine_daemon_health_checks_failed_total[1m]) > 1
+    labels:
+      severity: warning
+    annotations:
+      summary: "Containers health check failed (instance {{ $labels.instance }})"
+      description: "One or more container health checks failed"
+
+{% endraw %}
+
diff --git a/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml b/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml
new file mode 100644
index 0000000000..62e45fdbeb
--- /dev/null
+++ b/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added new default alerting rules for containers being unhealthy or stopped.