From e506d8f4fc41d0fc547abad7659526cecf591d7c Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 17 Mar 2026 09:51:28 +0100 Subject: [PATCH 1/4] Add calico bird down alert --- charts/controlplane-operations/Chart.yaml | 2 +- .../alerts/controlplane-gardener.yaml | 17 ++++++++++++++++- .../plugindefinition.yaml | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index ac1dc37..8752b95 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.27 +version: 1.0.28 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 78f17f9..3051507 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -7,7 +7,7 @@ groups: for: {{ dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} - severity: {{ dig "ShootUnavailability" "severity" "info" .Values.prometheusRules }} + severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} @@ -15,3 +15,18 @@ groups: description: Shoot cluster is unavailable for more than 10 minutes. summary: Shoot cluster is unavailable for more than 10 minutes. {{- end }} + +{{- if not (.Values.prometheusRules.disabled.CalicoBirdDown | default false) }} + - alert: CalicoBirdDown + expr: bird_protocol_up{import_filter="ACCEPT",ip_version="4",proto="BGP",state!="Passive"} == 0 + for: {{ dig "CalicoBirdDown" "for" "5m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "CalicoBirdDown" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBirdDown.md + service: {{ dig "CalicoBirdDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "CalicoBirdDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Calico Bird is down for more than 5 minutes. + summary: Calico Bird is down for more than 5 minutes. +{{- end }} diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index d205df4..71fb4f0 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.27 + version: 1.0.28 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.27 + version: 1.0.28 options: - name: prometheusRules.create description: Create Prometheus rules From 8c8dcd4a089ffe369de3cadcff7122fa1502ee79 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 17 Mar 2026 10:54:24 +0100 Subject: [PATCH 2/4] Calico alerts from upstream --- .../alerts/controlplane-gardener.yaml | 76 +++++++++++++++---- charts/controlplane-operations/values.yaml | 3 + 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 3051507..e4cd820 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -1,10 +1,13 @@ groups: - name: controlplane-gardener rules: + +### Gardener ### + {{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }} - alert: ShootUnavailability expr: shoot:availability == 0 - for: {{ dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }} + for: {{ dig "ShootUnavailability" "for" "5m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }} @@ -12,21 +15,68 @@ groups: service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot cluster is unavailable for more than 10 minutes. - summary: Shoot cluster is unavailable for more than 10 minutes. + description: Shoot cluster(s) unavailability for more than 5 minutes. + summary: Shoot cluster(s) unavailability for more than 5 minutes. +{{- end }} + +### Calico ### + +{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }} + - alert: CalicoBgpNeighborSessionDown + expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }} + for: {{ dig "CalicoBgpNeighborSessionDown" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "CalicoBgpNeighborSessionDown" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionDown.md + service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? + summary: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }} + - alert: CalicoBgpNeighborSessionAllDown + expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) == 0 + for: {{ dig "CalicoBgpNeighborSessionAllDown" "for" "10m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "CalicoBgpNeighborSessionAllDown" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionAllDown.md + service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Node has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? + summary: Node has no BGP neighbors. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }} + - alert: CalicoNodeMissing + expr: count(up{job="kube-kubelet"} == 1) > count(_pod_to_daemonset{owner_name="calico-node"} == 1) + for: {{ dig "CalicoNodeMissing" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "CalicoNodeMissing" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeMissing.md + service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Calico is not running on all nodes. + summary: Calico is not running on all bare metal nodes. Network datapath threatened! {{- end }} -{{- if not (.Values.prometheusRules.disabled.CalicoBirdDown | default false) }} - - alert: CalicoBirdDown - expr: bird_protocol_up{import_filter="ACCEPT",ip_version="4",proto="BGP",state!="Passive"} == 0 - for: {{ dig "CalicoBirdDown" "for" "5m" .Values.prometheusRules }} +{{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }} + - alert: CalicoNodeNotReady + expr: kube_pod_status_ready{namespace="kube-system",pod=~"calico-node-[a-zA-Z0-9]{5}",condition="false",type="shoot"} == 1 + for: {{ dig "CalicoNodeNotReady" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} - severity: {{ dig "CalicoBirdDown" "severity" "warning" .Values.prometheusRules }} - playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBirdDown.md - service: {{ dig "CalicoBirdDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} - support_group: {{ dig "CalicoBirdDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + severity: {{ dig "CalicoNodeNotReady" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeNotReady.md + service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico Bird is down for more than 5 minutes. - summary: Calico Bird is down for more than 5 minutes. + description: Calico-Node Pod is not Ready on all nodes. + summary: Calico-Node is not healthy on all bare metal nodes that are Ready. Risk of stale BGP advertisement. Network datapath threatened! {{- end }} diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml index 64ca625..d5ad323 100644 --- a/charts/controlplane-operations/values.yaml +++ b/charts/controlplane-operations/values.yaml @@ -47,6 +47,9 @@ prometheusRules: # KubernetesApiServerDown: true # KubeletDown: true + calico: + bgpNeighborCount: 2 + # ServerStuckInDiscovery: # service: "metal-api" # supportGroup: "foundation" From 435eb4fc928c00dc0b41f2f653a6d600ac735889 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 17 Mar 2026 10:57:42 +0100 Subject: [PATCH 3/4] Better description summary for shoot unavailability. --- .../controlplane-operations/alerts/controlplane-gardener.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index e4cd820..9f0ba8b 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -15,8 +15,8 @@ groups: service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot cluster(s) unavailability for more than 5 minutes. - summary: Shoot cluster(s) unavailability for more than 5 minutes. + description: Shoot cluster(s) unavailability detected. Need manual investigation of the root cause. Check the shoot cluster(s) and underlying infrastructure for issues. + summary: Shoot cluster(s) unavailability detected. {{- end }} ### Calico ### From 39a11e769db1b35a0dfa05fcc2a203d6696b3a85 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 17 Mar 2026 11:02:37 +0100 Subject: [PATCH 4/4] 15 mins for shoot unavailability --- .../controlplane-operations/alerts/controlplane-gardener.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 9f0ba8b..22db87b 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -7,7 +7,7 @@ groups: {{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }} - alert: ShootUnavailability expr: shoot:availability == 0 - for: {{ dig "ShootUnavailability" "for" "5m" .Values.prometheusRules }} + for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }}