diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index 8752b95..df16ecf 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.28 +version: 1.0.29 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 22db87b..67374d1 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -6,7 +6,7 @@ groups: {{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }} - alert: ShootUnavailability - expr: shoot:availability == 0 + expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0 for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -15,8 +15,8 @@ groups: service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot cluster(s) unavailability detected. Need manual investigation of the root cause. Check the shoot cluster(s) and underlying infrastructure for issues. - summary: Shoot cluster(s) unavailability detected. + description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues. + summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. {{- end }} ### Calico ### @@ -32,8 +32,8 @@ groups: service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? - summary: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. + description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? + summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }} @@ -47,8 +47,8 @@ groups: service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? - summary: Node has no BGP neighbors. + description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? + summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }} @@ -62,8 +62,8 @@ groups: service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico is not running on all nodes. - summary: Calico is not running on all bare metal nodes. Network datapath threatened! + description: Calico is not running on all bare metal nodes. Network datapath threatened! + summary: Calico is not running on all nodes. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }} @@ -77,6 +77,6 @@ groups: service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico-Node Pod is not Ready on all nodes. - summary: Calico-Node is not healthy on all bare metal nodes that are Ready. Risk of stale BGP advertisement. Network datapath threatened! + description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! + summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. {{- end }} diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml index 6c85752..49c85f7 100644 --- a/charts/controlplane-operations/alerts/controlplane-node.yaml +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -15,3 +15,18 @@ groups: description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened! summary: Bridged VLAN-tagged traffic is filtered by IPtables. {{- end }} + +{{- if not (.Values.prometheusRules.disabled.KubernetesNodeKCPNotReady | default false) }} + - alert: KubernetesNodeKCPNotReady + expr: kube_node_status_condition{condition="Ready",node=~"kcp-.+",status="false"} == 1 + for: {{ dig "KubernetesNodeKCPNotReady" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "KubernetesNodeKCPNotReady" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeKCPNotReady.md + service: {{ dig "KubernetesNodeKCPNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "KubernetesNodeKCPNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. Check node conditions and events for more details. + summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. +{{- end }} diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 71fb4f0..933779a 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.28 + version: 1.0.29 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.28 + version: 1.0.29 options: - name: prometheusRules.create description: Create Prometheus rules