diff --git a/etc/kayobe/inventory/group_vars/all/docker b/etc/kayobe/inventory/group_vars/all/docker
new file mode 100644
index 0000000000..bc807e8527
--- /dev/null
+++ b/etc/kayobe/inventory/group_vars/all/docker
@@ -0,0 +1,3 @@
+---
+# Docker Prometheus metrics endpoint (ip:port): this host's IP on the internal network, port 9323.
+docker_metrics_addr: "{{ internal_net_name | net_ip + ':9323'}}"
diff --git a/etc/kayobe/kolla/config/prometheus/docker.rules b/etc/kayobe/kolla/config/prometheus/docker.rules
new file mode 100644
index 0000000000..0d197e8713
--- /dev/null
+++ b/etc/kayobe/kolla/config/prometheus/docker.rules
@@ -0,0 +1,33 @@
+
+{% raw %}
+
+groups:
+- name: Docker
+  rules:
+
+  - alert: DockerContainerStopped
+    expr: 'engine_daemon_container_states_containers{state="stopped"} > 0'
+    labels:
+      severity: warning
+    annotations:
+      summary: "Containers not running (instance {{ $labels.instance }})"
+      description: "One or more containers are stopped"
+
+  - alert: DockerContainerPaused
+    expr: 'engine_daemon_container_states_containers{state="paused"} > 0'
+    labels:
+      severity: warning
+    annotations:
+      summary: "Containers paused (instance {{ $labels.instance }})"
+      description: "One or more containers are paused"
+
+  - alert: DockerContainerHealthCheckFail
+    expr: rate(engine_daemon_health_checks_failed_total[1m]) > 1
+    labels:
+      severity: warning
+    annotations:
+      summary: "Container health checks failed (instance {{ $labels.instance }})"
+      description: "One or more container health checks failed"
+
+{% endraw %}
+
diff --git a/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml b/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml
new file mode 100644
index 0000000000..62e45fdbeb
--- /dev/null
+++ b/releasenotes/notes/docker-alerts-30e8d870f25e500b.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added new default alerting rules for containers being stopped, paused, or failing health checks.