diff --git a/.github/workflows/validate-prometheus-alerts.yaml b/.github/workflows/validate-prometheus-alerts.yaml
new file mode 100644
index 0000000..59173d1
--- /dev/null
+++ b/.github/workflows/validate-prometheus-alerts.yaml
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+
+name: Validate Prometheus Alerts
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - 'charts/openstack-hypervisor-operator/alerts/*.yaml'
+      - 'charts/openstack-hypervisor-operator/alerts/*.yml'
+      # Also run when the validator itself changes (e.g. a promtool bump).
+      - '.github/workflows/validate-prometheus-alerts.yaml'
+
+permissions:
+  contents: read
+
+env:
+  # Prometheus release whose tarball provides the promtool binary.
+  PROMTOOL_VERSION: "3.8.0"
+
+defaults:
+  run:
+    shell: bash
+
+concurrency:
+  group: validate-prometheus-alerts-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  validate-alerts:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install promtool
+        run: |
+          set -euo pipefail
+          curl -sSfL "https://github.com/prometheus/prometheus/releases/download/v${PROMTOOL_VERSION}/prometheus-${PROMTOOL_VERSION}.linux-amd64.tar.gz" \
+            | tar xz --strip-components=1 "prometheus-${PROMTOOL_VERSION}.linux-amd64/promtool"
+          sudo install -m 0755 promtool /usr/local/bin/promtool
+          promtool --version
+
+      - name: Validate Prometheus alert rules
+        run: |
+          set -euo pipefail
+          # nullglob: unmatched patterns expand to nothing instead of themselves.
+          shopt -s nullglob
+          files=(charts/openstack-hypervisor-operator/alerts/*.{yaml,yml})
+          if [ ${#files[@]} -eq 0 ]; then
+            echo "No Prometheus rule files found."
+            exit 1
+          fi
+          promtool check rules "${files[@]}"
diff --git a/charts/openstack-hypervisor-operator/alerts/eviction.yaml b/charts/openstack-hypervisor-operator/alerts/eviction.yaml
new file mode 100644
index 0000000..127bdf1
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/alerts/eviction.yaml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+
+groups:
+- name: evictionLifecycle
+  rules:
+  - alert: EvictionFailed
+    # eviction_condition only carries name/condition/reason labels, so join the
+    # info metric (always 1) to expose `hypervisor` to the annotations below.
+    expr: |
+      (kube_customresource_eviction_condition{condition="Evicting", reason="Failed"} == 1)
+      * on (name) group_left (hypervisor)
+      kube_customresource_eviction_info
+    for: 5m
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Eviction {{ $labels.name }} has failed"
+      description: "The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }} has reached a terminal failure state. Manual intervention is required — check if the hypervisor exists in OpenStack."
+
+  - alert: EvictionMigrationFailing
+    expr: |
+      kube_customresource_eviction_condition{condition="MigratingInstance", reason="Failed"} == 1
+      and on (name)
+      kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Eviction {{ $labels.name }} has failing instance migrations for over 1 hour"
+      description: "The eviction {{ $labels.name }} has had MigratingInstance=Failed for more than 1 hour while still running. Instances may be in ERROR state, blocking eviction progress."
+
+  - alert: EvictionOutstandingRamHigh  # RAM still outstanding while the eviction is Running
+    expr: |
+      kube_customresource_eviction_outstanding_ram_mb > 0
+      and on (name)
+      kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
+    for: 6h
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Eviction {{ $labels.name }} has outstanding RAM for over 6 hours"
+      description: "The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM for more than 6 hours. Check for stuck live-migrations or instances that cannot be moved."
diff --git a/charts/openstack-hypervisor-operator/alerts/operator.yaml b/charts/openstack-hypervisor-operator/alerts/operator.yaml
new file mode 100644
index 0000000..c932232
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/alerts/operator.yaml
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+
+groups:
+- name: hypervisorLifecycle  # alerts on long-lived Onboarding / Evicting / evicted states
+  rules:
+  - alert: HypervisorOnboardingStuck  # Onboarding condition at 1 for the whole `for` window
+    expr: |
+      kube_customresource_hypervisor_condition{condition="Onboarding"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour"
+      description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync."
+
+  - alert: HypervisorEvictionStuck  # Evicting condition at 1 for the whole `for` window
+    expr: |
+      kube_customresource_hypervisor_condition{condition="Evicting"} == 1
+    for: 4h
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours"
+      description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs."
+
+  - alert: HypervisorEvictedTooLong  # evicted == 1 unless the Offboarded condition is 1
+    expr: |
+      kube_customresource_hypervisor_evicted == 1
+      unless on (name)
+      kube_customresource_hypervisor_condition{condition="Offboarded"} == 1
+    for: 7d
+    labels:
+      severity: info
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days"
+      description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning."
+
+- name: hypervisorSync  # alerts on the TraitsUpdated / AggregatesUpdated conditions
+  rules:
+  - alert: HypervisorTraitSyncFailed  # TraitsUpdated == 0 while Onboarding == 0
+    expr: |
+      kube_customresource_hypervisor_condition{condition="TraitsUpdated"} == 0
+      and on (name)
+      kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
+    for: 30m
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor {{ $labels.name }} trait sync has been failing"
+      description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity."
+
+  - alert: HypervisorAggregateSyncFailed  # (AggregatesUpdated == 0 and Onboarding == 0) unless Evicting == 1
+    expr: |
+      kube_customresource_hypervisor_condition{condition="AggregatesUpdated"} == 0
+      and on (name)
+      kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
+      unless on (name)
+      kube_customresource_hypervisor_condition{condition="Evicting"} == 1
+    for: 30m
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing"
+      description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity."
+
+- name: hypervisorOperatorHealth  # controller-runtime and scrape health of the operator itself
+  rules:
+  - alert: HypervisorOperatorReconcileErrors  # job-scoped so other controller-runtime operators cannot trigger it
+    expr: |
+      rate(controller_runtime_reconcile_errors_total{job=~".*hypervisor-operator.*"}[5m]) > 0.01
+    for: 15m
+    labels:
+      severity: warning
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors"
+      description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes."
+
+  - alert: HypervisorOperatorDown  # scrape target reports up == 0
+    expr: |
+      up{job=~".*hypervisor-operator.*"} == 0
+    for: 5m
+    labels:
+      severity: critical
+      type: hypervisor_operator
+    annotations:
+      summary: "Hypervisor operator is down"
+      description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes."
diff --git a/charts/openstack-hypervisor-operator/dashboards/hypervisor-overview.json b/charts/openstack-hypervisor-operator/dashboards/hypervisor-overview.json
new file mode 100644
index 0000000..00d5759
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/dashboards/hypervisor-overview.json
@@ -0,0 +1,595 @@
+{
+  "kind": "Dashboard",
+  "metadata": {
+    "name": "hypervisor-overview",
+    "project": "default"
+  },
+  "spec": {
+    "display": {
+      "name": "Hypervisor Overview"
+    },
+    "duration": "1h",
+    "refreshInterval": "30s",
+    "variables": [
+      {
+        "kind": "ListVariable",
+        "spec": {
+          "display": {
+            "name": "Zone"
+          },
+          "allowAllValue": true,
+          "allowMultiple": true,
+          "defaultValue": ["$__all"],
+          "name": "zone",
+          "plugin": {
+            "kind": "PrometheusLabelValuesVariable",
+            "spec": {
+              "datasource": {
+                "kind": "PrometheusDatasource",
+                "name": "prometheus"
+              },
+              "labelName": "zone",
+              "matchers": ["kube_customresource_hypervisor_info"]
+            }
+          }
+        }
+      },
+      {
+        "kind": "ListVariable",
+        "spec": {
+          "display": {
+            "name": "Building Block"
+          },
+          "allowAllValue": true,
+          "allowMultiple": true,
+          "defaultValue": ["$__all"],
+          "name": "building_block",
+          "plugin": {
+            "kind":
"PrometheusLabelValuesVariable", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "labelName": "building_block", + "matchers": ["kube_customresource_hypervisor_info"] + } + } + } + } + ], + "panels": { + "totalHypervisors": { + "kind": "Panel", + "spec": { + "display": { + "name": "Total Hypervisors" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "count(kube_customresource_hypervisor_info{zone=~\"$zone\", building_block=~\"$building_block\"})" + } + } + } + } + ] + } + }, + "readyHypervisors": { + "kind": "Panel", + "spec": { + "display": { + "name": "Ready Hypervisors" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "count(kube_customresource_hypervisor_condition{condition=\"Ready\", zone=~\"$zone\", building_block=~\"$building_block\"} == 1)" + } + } + } + } + ] + } + }, + "notReadyHypervisors": { + "kind": "Panel", + "spec": { + "display": { + "name": "Not Ready Hypervisors" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "count(kube_customresource_hypervisor_condition{condition=\"Ready\", zone=~\"$zone\", building_block=~\"$building_block\"} == 0) or vector(0)" + } + } + } + } + ] + } + }, + "evictedHypervisors": { + "kind": "Panel", + "spec": { + "display": { + 
"name": "Evicted Hypervisors" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "count(kube_customresource_hypervisor_evicted{zone=~\"$zone\", building_block=~\"$building_block\"} == 1) or vector(0)" + } + } + } + } + ] + } + }, + "totalInstances": { + "kind": "Panel", + "spec": { + "display": { + "name": "Total Instances" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "sum(kube_customresource_hypervisor_num_instances{zone=~\"$zone\", building_block=~\"$building_block\"})" + } + } + } + } + ] + } + }, + "totalCpuCapacity": { + "kind": "Panel", + "spec": { + "display": { + "name": "Total CPU Capacity" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last" + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "sum(kube_customresource_hypervisor_capability_cpus{zone=~\"$zone\", building_block=~\"$building_block\"})" + } + } + } + } + ] + } + }, + "instancesPerHypervisor": { + "kind": "Panel", + "spec": { + "display": { + "name": "Instances per Hypervisor" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": 
"kube_customresource_hypervisor_num_instances{zone=~\"$zone\", building_block=~\"$building_block\"}", + "seriesNameFormat": "{{name}}" + } + } + } + } + ] + } + }, + "cpuCapacityByZone": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU Capacity by Zone" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "sum by (zone) (kube_customresource_hypervisor_capability_cpus{zone=~\"$zone\", building_block=~\"$building_block\"})", + "seriesNameFormat": "{{zone}}" + } + } + } + } + ] + } + }, + "hypervisorConditions": { + "kind": "Panel", + "spec": { + "display": { + "name": "Hypervisor Conditions" + }, + "plugin": { + "kind": "Table", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "kube_customresource_hypervisor_condition{zone=~\"$zone\", building_block=~\"$building_block\"}" + } + } + } + } + ] + } + }, + "hypervisorsByZone": { + "kind": "Panel", + "spec": { + "display": { + "name": "Hypervisors by Zone" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "count by (zone) (kube_customresource_hypervisor_info{zone=~\"$zone\", building_block=~\"$building_block\"})", + "seriesNameFormat": "{{zone}}" + } + } + } + } + ] + } + }, + "reconcileRate": { + "kind": "Panel", + "spec": { + "display": { + "name": "Reconciliation Rate" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": 
"TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "sum by (controller, result) (rate(controller_runtime_reconcile_total{job=~\".*hypervisor-operator.*\"}[5m]))", + "seriesNameFormat": "{{controller}} - {{result}}" + } + } + } + } + ] + } + }, + "reconcileErrors": { + "kind": "Panel", + "spec": { + "display": { + "name": "Reconciliation Errors" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "sum by (controller) (rate(controller_runtime_reconcile_errors_total{job=~\".*hypervisor-operator.*\"}[5m]))", + "seriesNameFormat": "{{controller}}" + } + } + } + } + ] + } + }, + "reconcileDuration": { + "kind": "Panel", + "spec": { + "display": { + "name": "Reconciliation Duration (p99)" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "yAxis": { + "format": { + "unit": "seconds" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "histogram_quantile(0.99, sum by (controller, le) (rate(controller_runtime_reconcile_time_seconds_bucket{job=~\".*hypervisor-operator.*\"}[5m])))", + "seriesNameFormat": "{{controller}}" + } + } + } + } + ] + } + }, + "workqueueDepth": { + "kind": "Panel", + "spec": { + "display": { + "name": "Workqueue Depth" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": 
"workqueue_depth{job=~\".*hypervisor-operator.*\"}", + "seriesNameFormat": "{{name}}" + } + } + } + } + ] + } + }, + "lifecycleAndHa": { + "kind": "Panel", + "spec": { + "display": { + "name": "Lifecycle & HA Status" + }, + "plugin": { + "kind": "Table", + "spec": {} + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "kube_customresource_hypervisor_lifecycle_enabled{zone=~\"$zone\", building_block=~\"$building_block\"}" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "datasource": { + "kind": "PrometheusDatasource", + "name": "prometheus" + }, + "query": "kube_customresource_hypervisor_high_availability{zone=~\"$zone\", building_block=~\"$building_block\"}" + } + } + } + } + ] + } + } + }, + "layouts": [ + { + "kind": "Grid", + "spec": { + "display": { + "title": "Overview", + "collapse": { + "open": true + } + }, + "items": [ + { "x": 0, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/totalHypervisors" } }, + { "x": 4, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/readyHypervisors" } }, + { "x": 8, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/notReadyHypervisors" } }, + { "x": 12, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/evictedHypervisors" } }, + { "x": 16, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/totalInstances" } }, + { "x": 20, "y": 0, "width": 4, "height": 4, "content": { "$ref": "#/spec/panels/totalCpuCapacity" } } + ] + } + }, + { + "kind": "Grid", + "spec": { + "display": { + "title": "Hypervisor Details", + "collapse": { + "open": true + } + }, + "items": [ + { "x": 0, "y": 0, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/instancesPerHypervisor" } }, + { "x": 12, "y": 0, 
"width": 12, "height": 8, "content": { "$ref": "#/spec/panels/cpuCapacityByZone" } },
+            { "x": 0, "y": 8, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/hypervisorsByZone" } },
+            { "x": 12, "y": 8, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/hypervisorConditions" } },
+            { "x": 0, "y": 16, "width": 24, "height": 8, "content": { "$ref": "#/spec/panels/lifecycleAndHa" } }
+          ]
+        }
+      },
+      {
+        "kind": "Grid",
+        "spec": {
+          "display": {
+            "title": "Operator Health",
+            "collapse": {
+              "open": true
+            }
+          },
+          "items": [
+            { "x": 0, "y": 0, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/reconcileRate" } },
+            { "x": 12, "y": 0, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/reconcileErrors" } },
+            { "x": 0, "y": 8, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/reconcileDuration" } },
+            { "x": 12, "y": 8, "width": 12, "height": 8, "content": { "$ref": "#/spec/panels/workqueueDepth" } }
+          ]
+        }
+      }
+    ]
+  }
+}
diff --git a/charts/openstack-hypervisor-operator/templates/_helpers.tpl b/charts/openstack-hypervisor-operator/templates/_helpers.tpl
index 9b4ee5d..101bd50 100644
--- a/charts/openstack-hypervisor-operator/templates/_helpers.tpl
+++ b/charts/openstack-hypervisor-operator/templates/_helpers.tpl
@@ -60,3 +60,60 @@ Create the name of the service account to use
 {{- default "default" .Values.serviceAccount.name }}
 {{- end }}
 {{- end }}
+
+{{/*
+Additional alert-rule labels: emits .Values.prometheusRules.additionalRuleLabels as YAML when set. Call with the root context.
+*/}}
+{{- define "openstack-hypervisor-operator.additionalRuleLabels" -}}
+{{- with .Values.prometheusRules.additionalRuleLabels }}
+{{ toYaml . }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Selector labels for PrometheusRule metadata (Prometheus Operator discovery). Call with (list <path> $root); emits one "name: value" line per prometheusRules.ruleSelectors entry, rendering each value through tpl and failing when name or value is missing.
+*/}}
+{{- define "openstack-hypervisor-operator.ruleSelectorLabels" -}}
+{{- $root := index . 1 -}}
+{{- with $root.Values.prometheusRules.ruleSelectors }}
+{{- range $i, $target := .
}}
+{{ $target.name | required (printf "$.Values.prometheusRules.ruleSelectors[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.prometheusRules.ruleSelectors[%v].value missing" $i)) $root }}
+{{- end }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Selector labels for dashboard ConfigMap metadata (Perses discovery). Call with (list <path> $root); emits one "name: value" line per dashboards.dashboardSelectors entry, rendering each value through tpl and failing when name or value is missing.
+*/}}
+{{- define "openstack-hypervisor-operator.dashboardSelectorLabels" -}}
+{{- $root := index . 1 -}}
+{{- with $root.Values.dashboards.dashboardSelectors }}
+{{- range $i, $target := . }}
+{{ $target.name | required (printf "$.Values.dashboards.dashboardSelectors[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.dashboards.dashboardSelectors[%v].value missing" $i)) $root }}
+{{- end }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Selector labels for global dashboard ConfigMap metadata. Same contract as dashboardSelectorLabels, but reads dashboards.global.dashboardSelectors.
+*/}}
+{{- define "openstack-hypervisor-operator.globalDashboardSelectorLabels" -}}
+{{- $root := index . 1 -}}
+{{- with $root.Values.dashboards.global.dashboardSelectors }}
+{{- range $i, $target := . }}
+{{ $target.name | required (printf "$.Values.dashboards.global.dashboardSelectors[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.dashboards.global.dashboardSelectors[%v].value missing" $i)) $root }}
+{{- end }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Common labels for monitoring resources (alerts, dashboards). Call with (list <path> $root). Chart version and release name are quoted so they always render as YAML strings (an all-digit release name would otherwise become a number).
+*/}}
+{{- define "openstack-hypervisor-operator.monitoringLabels" -}}
+{{- $root := index . 1 -}}
+app.kubernetes.io/version: {{ $root.Chart.Version | quote }}
+app.kubernetes.io/part-of: {{ $root.Release.Name | quote }}
+{{- with $root.Values.global.commonLabels }}
+{{ toYaml .
}}
+{{- end }}
+{{- end -}}
diff --git a/charts/openstack-hypervisor-operator/templates/alerts.yaml b/charts/openstack-hypervisor-operator/templates/alerts.yaml
new file mode 100644
index 0000000..7562bb9
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/templates/alerts.yaml
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.prometheusRules.create -}}
+{{- $root := . -}}
+{{- $disabled := $root.Values.prometheusRules.disabled | default dict -}}
+{{- $additionalRuleLabels := $root.Values.prometheusRules.additionalRuleLabels | default dict -}}
+{{- $docStarted := false }}
+
+{{- range $alertPath, $alertRaw := $root.Files.Glob "alerts/*.yaml" }}
+{{- $alertName := base $alertPath }}
+{{- $parsed := fromYaml (printf "%s" $alertRaw) }}
+
+{{/*
+Drop alerts listed in prometheusRules.disabled and merge additionalRuleLabels
+into each remaining rule. sprig's merge favours its first argument, so
+additionalRuleLabels win over a rule's own labels on key conflicts.
+*/}}
+{{- $filteredGroups := list }}
+{{- range $group := $parsed.groups }}
+  {{- $filteredRules := list }}
+  {{- range $rule := $group.rules }}
+    {{- if not (index $disabled $rule.alert) }}
+      {{- if $additionalRuleLabels }}
+        {{- $mergedLabels := merge (deepCopy $additionalRuleLabels) $rule.labels }}
+        {{- $_ := set $rule "labels" $mergedLabels }}
+      {{- end }}
+      {{- $filteredRules = append $filteredRules $rule }}
+    {{- end }}
+  {{- end }}
+  {{- if $filteredRules }}
+    {{- $filteredGroup := dict "name" $group.name "rules" $filteredRules }}
+    {{- $filteredGroups = append $filteredGroups $filteredGroup }}
+  {{- end }}
+{{- end }}
+
+{{- if $filteredGroups }}
+{{- if $docStarted }}
+---
+{{- end }}
+{{- $docStarted = true }}
+{{- /* trimSuffix keeps a truncated name from ending in "-", which Kubernetes object names may not do. */}}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s-alerts" $root.Release.Name (trimSuffix ".yaml" $alertName) | trunc 63 | trimSuffix "-" | lower }}
+  labels:
+{{- include "openstack-hypervisor-operator.ruleSelectorLabels" (list $alertPath $root) | indent 4 }}
+{{ include
"openstack-hypervisor-operator.monitoringLabels" (list $alertPath $root) | indent 4 }}
+{{- with $root.Values.prometheusRules.labels }}
+{{ toYaml . | indent 4 }}
+{{- end }}
+{{- with $root.Values.prometheusRules.annotations }}
+  annotations:
+{{ toYaml . | indent 4 }}
+{{- end }}
+spec:
+  groups:
+{{ toYaml $filteredGroups | indent 4 }}
+{{- end }}
+{{- end }}
+{{- end }}
diff --git a/charts/openstack-hypervisor-operator/templates/dashboards.yaml b/charts/openstack-hypervisor-operator/templates/dashboards.yaml
new file mode 100644
index 0000000..a52c9dd
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/templates/dashboards.yaml
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+{{- /* When dashboards.create is set: one ConfigMap per JSON file under dashboards/, with the file body stored under a key made from the path with "/" replaced by "-". */}}
+{{- if .Values.dashboards.create }}
+{{ $root := . }}
+{{- range $path, $bytes := .Files.Glob "dashboards/*.json" }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-") | trunc 63 | trimSuffix "-" }}
+  labels:
+{{- include "openstack-hypervisor-operator.dashboardSelectorLabels" (list $path $root) | indent 4 }}
+{{ include "openstack-hypervisor-operator.monitoringLabels" (list $path $root) | indent 4 }}
+data:
+{{ printf "%s: |-" ($path | replace "/" "-" | indent 2) }}
+{{ printf "%s" $bytes | indent 4 }}
+{{- end }}
+{{- end }}
+{{- /* When dashboards.global.create is set: the same dashboards again, as "-global" ConfigMaps labelled from dashboards.global.dashboardSelectors. */}}
+{{- if .Values.dashboards.global.create }}
+{{ $root := .
}}
+{{- range $path, $bytes := .Files.Glob "dashboards/*.json" }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-") | trunc 56 | trimSuffix "-" }}-global{{/* base capped at 56 chars so the 7-char "-global" suffix always survives the 63-char limit and never collides with the non-global ConfigMap */}}
+  labels:
+{{- include "openstack-hypervisor-operator.globalDashboardSelectorLabels" (list $path $root) | indent 4 }}
+{{ include "openstack-hypervisor-operator.monitoringLabels" (list $path $root) | indent 4 }}
+data:
+{{ printf "%s: |-" ($path | replace "/" "-" | indent 2) }}
+{{ printf "%s" $bytes | indent 4 }}
+{{- end }}
+{{- end }}
diff --git a/charts/openstack-hypervisor-operator/templates/metrics-eviction-cm.yaml b/charts/openstack-hypervisor-operator/templates/metrics-eviction-cm.yaml
new file mode 100644
index 0000000..8022af7
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/templates/metrics-eviction-cm.yaml
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+{{- /* ConfigMap with the Eviction metric definitions (eviction-metrics.yaml); rendered only when customResourceMetrics.create is set and customResourceMetrics.disabled.eviction is not. NOTE(review): the dev.custom.kube-state-metrics label presumably lets kube-state-metrics discover it - confirm. */}}
+{{- if .Values.customResourceMetrics.create }}
+{{- $disabled := .Values.customResourceMetrics.disabled | default dict }}
+{{- if not (index $disabled "eviction") }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-eviction-resources" .Release.Name | trunc 63 | trimSuffix "-" }}
+  labels:
+    dev.custom.kube-state-metrics: "true"
+{{ include "openstack-hypervisor-operator.labels" .
| indent 4 }}
+data:
+  eviction-metrics.yaml: |
+    spec:
+      resources:
+        - groupVersionKind:
+            group: kvm.cloud.sap
+            version: v1
+            kind: Eviction
+          labelsFromPath:
+            name: [metadata, name]
+          metrics:
+            - name: eviction_info
+              help: "Info metric for eviction with hypervisor and reason labels (always 1)"
+              each:
+                type: Info
+                info:
+                  labelsFromPath:
+                    hypervisor: [spec, hypervisor]
+                    reason: [spec, reason]
+            - name: eviction_outstanding_ram_mb
+              help: "Outstanding RAM in MB to be migrated"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, outstandingRamMb]
+                  nilIsZero: true
+            - name: eviction_condition
+              help: "Eviction condition status (1=True, 0=False)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition: [type]
+                    reason: [reason]
+                  valueFrom: [status]
+                  booleanTrue: "True"
+                  booleanFalse: "False"
+{{- end }}
+{{- end }}
diff --git a/charts/openstack-hypervisor-operator/templates/metrics-hypervisor-cm.yaml b/charts/openstack-hypervisor-operator/templates/metrics-hypervisor-cm.yaml
new file mode 100644
index 0000000..c894443
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/templates/metrics-hypervisor-cm.yaml
@@ -0,0 +1,93 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+{{- /* ConfigMap with the Hypervisor metric definitions (hypervisor-metrics.yaml); rendered only when customResourceMetrics.create is set and customResourceMetrics.disabled.hypervisor is not. NOTE(review): the dev.custom.kube-state-metrics label presumably lets kube-state-metrics discover it - confirm. */}}
+{{- if .Values.customResourceMetrics.create }}
+{{- $disabled := .Values.customResourceMetrics.disabled | default dict }}
+{{- if not (index $disabled "hypervisor") }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-hypervisor-resources" .Release.Name | trunc 63 | trimSuffix "-" }}
+  labels:
+    dev.custom.kube-state-metrics: "true"
+{{ include "openstack-hypervisor-operator.labels" .
| indent 4 }} +data: + hypervisor-metrics.yaml: | + spec: + resources: + - groupVersionKind: + group: kvm.cloud.sap + version: v1 + kind: Hypervisor + labelsFromPath: + name: [metadata, name] + zone: [metadata, labels, "topology.kubernetes.io/zone"] + building_block: [metadata, labels, "kubernetes.metal.cloud.sap/bb"] + worker_group: [metadata, labels, "worker.garden.sapcloud.io/group"] + maintenance: [spec, maintenance] + metrics: + - name: hypervisor_info + help: "Info metric for hypervisor with version label (always 1)" + each: + type: Info + info: + labelsFromPath: + hypervisor_version: [status, hypervisorVersion] + libvirt_version: [status, libVirtVersion] + hardware_model: [status, operatingSystem, hardwareModel] + os_version: [status, operatingSystem, prettyVersion] + - name: hypervisor_num_instances + help: "Number of instances on the hypervisor" + each: + type: Gauge + gauge: + path: [status, numInstances] + nilIsZero: true + - name: hypervisor_evicted + help: "1 if hypervisor is evicted" + each: + type: Gauge + gauge: + path: [status, evicted] + booleanTrue: "true" + booleanFalse: "false" + nilIsZero: true + - name: hypervisor_condition + help: "Hypervisor condition status (1=True, 0=False)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition: [type] + reason: [reason] + valueFrom: [status] + booleanTrue: "True" + booleanFalse: "False" + - name: hypervisor_capability_cpus + help: "Total CPUs available on the hypervisor" + each: + type: Gauge + gauge: + path: [status, capabilities, cpus] + nilIsZero: true + - name: hypervisor_lifecycle_enabled + help: "1 if lifecycle management is enabled" + each: + type: Gauge + gauge: + path: [spec, lifecycleEnabled] + booleanTrue: "true" + booleanFalse: "false" + nilIsZero: true + - name: hypervisor_high_availability + help: "1 if high availability is enabled" + each: + type: Gauge + gauge: + path: [spec, highAvailability] + booleanTrue: "true" + booleanFalse: "false" + nilIsZero: 
true
+{{- end }}
+{{- end }}
diff --git a/charts/openstack-hypervisor-operator/values.yaml b/charts/openstack-hypervisor-operator/values.yaml
index 23a2583..a6a53cd 100644
--- a/charts/openstack-hypervisor-operator/values.yaml
+++ b/charts/openstack-hypervisor-operator/values.yaml
@@ -53,3 +53,44 @@ serviceAccount:
   automount: true
   create: true
   name: ""
+
+global:
+  commonLabels: {}
+
+prometheusRules:
+  create: true
+  # List of {name, value} label pairs added to each PrometheusRule's metadata
+  # so that Prometheus Operator rule selectors can match it. `value` is
+  # rendered with tpl. Example:
+  # ruleSelectors:
+  #   - name: prometheus
+  #     value: my-prometheus
+  ruleSelectors: []
+  labels: {}
+  annotations: {}
+  additionalRuleLabels: {}
+  # Disable individual alerts by name:
+  # disabled:
+  #   HypervisorOnboardingStuck: true
+  #   HypervisorEvictedTooLong: true
+  #   EvictionFailed: true
+  #   EvictionMigrationFailing: true
+  #   EvictionOutstandingRamHigh: true
+  disabled: {}
+
+dashboards:
+  create: true
+  global:
+    create: false
+    dashboardSelectors: []
+  dashboardSelectors:
+    - name: perses.dev/resource
+      value: '"true"'
+
+customResourceMetrics:
+  create: true
+  # Disable individual custom resource metrics ConfigMaps by name:
+  # disabled:
+  #   hypervisor: true
+  #   eviction: true
+  disabled: {}