Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.0.28
version: 1.0.29
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
22 changes: 11 additions & 11 deletions charts/controlplane-operations/alerts/controlplane-gardener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ groups:

{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
- alert: ShootUnavailability
expr: shoot:availability == 0
expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0
for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
Expand All @@ -15,8 +15,8 @@ groups:
service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Shoot cluster(s) unavailability detected. Need manual investigation of the root cause. Check the shoot cluster(s) and underlying infrastructure for issues.
summary: Shoot cluster(s) unavailability detected.
description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues.
summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable.
{{- end }}

### Calico ###
Expand All @@ -32,8 +32,8 @@ groups:
service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }}
Expand All @@ -47,8 +47,8 @@ groups:
service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Node has no BGP neighbors.
description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }}
Expand All @@ -62,8 +62,8 @@ groups:
service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico is not running on all nodes.
summary: Calico is not running on all bare metal nodes. Network datapath threatened!
description: Calico is not running on all bare metal nodes. Network datapath threatened!
summary: Calico is not running on all nodes.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }}
Expand All @@ -77,6 +77,6 @@ groups:
service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico-Node Pod is not Ready on all nodes.
summary: Calico-Node is not healthy on all bare metal nodes that are Ready. Risk of stale BGP advertisement. Network datapath threatened!
description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
{{- end }}
15 changes: 15 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,18 @@ groups:
description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened!
summary: Bridged VLAN-tagged traffic is filtered by IPtables.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.KubernetesNodeKCPNotReady | default false) }}
# KubernetesNodeKCPNotReady: rendered unless
# .Values.prometheusRules.disabled.KubernetesNodeKCPNotReady is set to a truthy value.
- alert: KubernetesNodeKCPNotReady
# Selects Ready-condition series with status="false" for nodes whose name matches kcp-.+
# NOTE(review): only status="false" is matched; if the metric source also reports
# status="unknown" (e.g. for unreachable nodes), those nodes would not fire here - confirm intent.
expr: kube_node_status_condition{condition="Ready",node=~"kcp-.+",status="false"} == 1
# Pending duration: taken from .Values.prometheusRules.KubernetesNodeKCPNotReady.for, falling back to "30m".
for: {{ dig "KubernetesNodeKCPNotReady" "for" "30m" .Values.prometheusRules }}
labels:
# Labels produced by the chart helper "controlplane-operations.additionalRuleLabels" (defined outside this view).
{{ include "controlplane-operations.additionalRuleLabels" . }}
# Severity falls back to "warning" when no per-alert override is configured.
severity: {{ dig "KubernetesNodeKCPNotReady" "severity" "warning" .Values.prometheusRules }}
# NOTE(review): this URL has no /blob/<branch>/ path segment - verify that the link resolves.
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeKCPNotReady.md
# service / support_group fall back to the chart-wide defaultService / defaultSupportGroup values.
service: {{ dig "KubernetesNodeKCPNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "KubernetesNodeKCPNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
# The backtick-wrapped segments are Helm raw strings, so the inner $labels placeholders
# are emitted verbatim by Helm and left for Prometheus to expand when the alert fires.
description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. Check node conditions and events for more details.
summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready.
{{- end }}
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.0.28
version: 1.0.29
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.0.28
version: 1.0.29
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
Loading