diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml
index ac1dc37..8752b95 100644
--- a/charts/controlplane-operations/Chart.yaml
+++ b/charts/controlplane-operations/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: controlplane-operations
-version: 1.0.27
+version: 1.0.28
 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
 maintainers:
   - name: Vladimir Videlov (d051408)
diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml
index 78f17f9..22db87b 100644
--- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml
+++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml
@@ -1,17 +1,88 @@
 groups:
 - name: controlplane-gardener
   rules:
+
+### Gardener ###
+
 {{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
   - alert: ShootUnavailability
     expr: shoot:availability == 0
-    for: {{ dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }}
+    for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }}
     labels:
       {{ include "controlplane-operations.additionalRuleLabels" . }}
-      severity: {{ dig "ShootUnavailability" "severity" "info" .Values.prometheusRules }}
+      severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }}
       playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md
       service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
       support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
     annotations:
-      description: Shoot cluster is unavailable for more than 10 minutes.
-      summary: Shoot cluster is unavailable for more than 10 minutes.
+      description: Shoot cluster(s) unavailability detected. Need manual investigation of the root cause. Check the shoot cluster(s) and underlying infrastructure for issues.
+      summary: Shoot cluster(s) unavailability detected.
+{{- end }}
+
+### Calico ###
+
+{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }}
+  # Fires per node when fewer BGP sessions are Established than prometheusRules.calico.bgpNeighborCount.
+  - alert: CalicoBgpNeighborSessionDown
+    expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }}
+    for: {{ dig "CalicoBgpNeighborSessionDown" "for" "30m" .Values.prometheusRules }}
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "CalicoBgpNeighborSessionDown" "severity" "warning" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionDown.md
+      service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
+      summary: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }}
+  # Fires per node when the Established BGP session series sum to 0.
+  # NOTE(review): if a session that leaves Established is no longer exported under
+  # state="Established", this selector yields no data instead of 0 - confirm against the exporter.
+  - alert: CalicoBgpNeighborSessionAllDown
+    expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) == 0
+    for: {{ dig "CalicoBgpNeighborSessionAllDown" "for" "10m" .Values.prometheusRules }}
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "CalicoBgpNeighborSessionAllDown" "severity" "warning" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionAllDown.md
+      service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: Node has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
+      summary: Node has no BGP neighbors.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }}
+  # Fires when more kubelet targets are up than calico-node daemonset pods are counted.
+  - alert: CalicoNodeMissing
+    expr: count(up{job="kube-kubelet"} == 1) > count(_pod_to_daemonset{owner_name="calico-node"} == 1)
+    for: {{ dig "CalicoNodeMissing" "for" "30m" .Values.prometheusRules }}
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "CalicoNodeMissing" "severity" "warning" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeMissing.md
+      service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: Calico is not running on all bare metal nodes. Network datapath threatened!
+      summary: Calico is not running on all nodes.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }}
+  # Fires for each calico-node pod in kube-system whose Ready condition is reported as false.
+  - alert: CalicoNodeNotReady
+    expr: kube_pod_status_ready{namespace="kube-system",pod=~"calico-node-[a-zA-Z0-9]{5}",condition="false",type="shoot"} == 1
+    for: {{ dig "CalicoNodeNotReady" "for" "30m" .Values.prometheusRules }}
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "CalicoNodeNotReady" "severity" "warning" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeNotReady.md
+      service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: Calico-Node is not healthy on all bare metal nodes that are Ready. Risk of stale BGP advertisement. Network datapath threatened!
+      summary: Calico-Node Pod is not Ready on all nodes.
 {{- end }}
diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml
index d205df4..71fb4f0 100644
--- a/charts/controlplane-operations/plugindefinition.yaml
+++ b/charts/controlplane-operations/plugindefinition.yaml
@@ -3,7 +3,7 @@ kind: PluginDefinition
 metadata:
   name: controlplane-operations
 spec:
-  version: 1.0.27
+  version: 1.0.28
   displayName: Controlplane operations bundle
   description: Operations bundle for Controlane clusters
   docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
@@ -11,7 +11,7 @@ spec:
   helmChart:
     name: controlplane-operations
     repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
-    version: 1.0.27
+    version: 1.0.28
   options:
     - name: prometheusRules.create
       description: Create Prometheus rules
diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml
index 64ca625..d5ad323 100644
--- a/charts/controlplane-operations/values.yaml
+++ b/charts/controlplane-operations/values.yaml
@@ -47,6 +47,10 @@ prometheusRules:
     # KubernetesApiServerDown: true
     # KubeletDown: true
 
+  calico:
+    # Threshold for CalicoBgpNeighborSessionDown: the alert fires below this many Established BGP sessions per node.
+    bgpNeighborCount: 2
+
   # ServerStuckInDiscovery:
   #   service: "metal-api"
   #   supportGroup: "foundation"