From 3e7cd4dc14f50ad806d0583abc6b75f2c1975f1a Mon Sep 17 00:00:00 2001 From: Softer Date: Wed, 3 Jun 2026 13:50:54 +0300 Subject: [PATCH] feat: add flagger_canary_phase metric with granular phase values flagger_canary_status collapses the 11 canary phases into 3 values (0 running, 1 successful, 2 failed), so dashboards cannot tell WaitingPromotion, Promoting, Finalising or Succeeded apart on a Grafana state-timeline. Add a new flagger_canary_phase gauge that exposes each phase as a unique value (0=Initializing ... 10=Terminated) via a deterministic phase-to-value map. SetStatus now also sets the new gauge, so every existing call site is covered without touching the scheduler. flagger_canary_status is left unchanged to avoid breaking existing dashboards and alerts. The Terminating (9) phase is recorded from the finalizer and the Terminated (10) phase from the informer delete handler, so a deleted canary keeps emitting the metric with the Terminated value (10) instead of leaving a stale series; queries can exclude terminating and terminated canaries with e.g. flagger_canary_phase < 9. This addresses the stale-metric problem from #1029 without deleting metrics, which was flagged as a breaking change in #1856. 
Signed-off-by: Softer --- docs/gitbook/usage/monitoring.md | 8 +++++ pkg/controller/controller.go | 3 ++ pkg/controller/finalizer.go | 4 +++ pkg/controller/scheduler_metrics_test.go | 12 +++++++ pkg/metrics/recorder.go | 45 ++++++++++++++++++++++++ pkg/metrics/recorder_test.go | 24 +++++++++++++ 6 files changed, 96 insertions(+) diff --git a/docs/gitbook/usage/monitoring.md b/docs/gitbook/usage/monitoring.md index d1379991a..deb09e177 100644 --- a/docs/gitbook/usage/monitoring.md +++ b/docs/gitbook/usage/monitoring.md @@ -108,6 +108,14 @@ flagger_canary_total{namespace="test"} 1 # 0 - running, 1 - successful, 2 - failed flagger_canary_status{name="podinfo" namespace="test"} 1 +# Current canary phase gauge +# 0 - Initializing, 1 - Initialized, 2 - Waiting, 3 - Progressing, +# 4 - WaitingPromotion, 5 - Promoting, 6 - Finalising, 7 - Succeeded, +# 8 - Failed, 9 - Terminating, 10 - Terminated +# A deleted canary keeps emitting the metric with the Terminated value (10), +# so queries can filter out removed canaries with e.g. flagger_canary_phase < 9 +flagger_canary_phase{name="podinfo" namespace="test"} 7 + # Canary traffic weight gauge flagger_canary_weight{workload="podinfo-primary" namespace="test"} 95 flagger_canary_weight{workload="podinfo" namespace="test"} 5 diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 39c72c373..8939da871 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -180,6 +180,9 @@ func NewController( if ok { ctrl.logger.Infof("Deleting %s.%s from cache", r.Name, r.Namespace) ctrl.canaries.Delete(fmt.Sprintf("%s.%s", r.Name, r.Namespace)) + // record the terminated phase so deleted canaries can be filtered + // out of the flagger_canary_phase metric (e.g. 
flagger_canary_phase < 9) + ctrl.recorder.SetPhase(&r, flaggerv1.CanaryPhaseTerminated) } }, }) diff --git a/pkg/controller/finalizer.go b/pkg/controller/finalizer.go index 57ab04863..e3f82123f 100644 --- a/pkg/controller/finalizer.go +++ b/pkg/controller/finalizer.go @@ -51,6 +51,10 @@ func (c *Controller) finalize(old interface{}) error { return fmt.Errorf("failed to update status: %w", err) } + // record the terminating phase on the flagger_canary_phase metric; + // the final Terminated phase is recorded by the informer delete handler + c.recorder.SetPhase(canary, flaggerv1.CanaryPhaseTerminating) + // record event c.recordEventInfof(canary, "Terminating canary %s.%s", canary.Name, canary.Namespace) } diff --git a/pkg/controller/scheduler_metrics_test.go b/pkg/controller/scheduler_metrics_test.go index 1f7cd67de..df876bd30 100644 --- a/pkg/controller/scheduler_metrics_test.go +++ b/pkg/controller/scheduler_metrics_test.go @@ -197,6 +197,9 @@ func TestController_MetricsStateTransition(t *testing.T) { actualStatus := testutil.ToFloat64(mocks.ctrl.recorder.GetStatusMetric().WithLabelValues("podinfo", "default")) assert.Equal(t, float64(1), actualStatus) + actualPhase := testutil.ToFloat64(mocks.ctrl.recorder.GetPhaseMetric().WithLabelValues("podinfo", "default")) + assert.Equal(t, float64(1), actualPhase) // Initialized + actualTotal := testutil.ToFloat64(mocks.ctrl.recorder.GetTotalMetric().WithLabelValues("default")) assert.GreaterOrEqual(t, actualTotal, float64(0)) dep2 := newDeploymentTestDeploymentV2() @@ -210,6 +213,9 @@ func TestController_MetricsStateTransition(t *testing.T) { actualStatus = testutil.ToFloat64(mocks.ctrl.recorder.GetStatusMetric().WithLabelValues("podinfo", "default")) assert.Equal(t, float64(0), actualStatus) + actualPhase = testutil.ToFloat64(mocks.ctrl.recorder.GetPhaseMetric().WithLabelValues("podinfo", "default")) + assert.Equal(t, float64(3), actualPhase) // Progressing + actualPrimaryWeight := 
testutil.ToFloat64(mocks.ctrl.recorder.GetWeightMetric().WithLabelValues("podinfo-primary", "default")) actualCanaryWeight := testutil.ToFloat64(mocks.ctrl.recorder.GetWeightMetric().WithLabelValues("podinfo", "default")) @@ -234,6 +240,9 @@ func TestController_MetricsStateTransition(t *testing.T) { successCount := testutil.ToFloat64(mocks.ctrl.recorder.GetSuccessesMetric().WithLabelValues("podinfo", "default", "canary", "completed")) assert.Equal(t, float64(1), successCount) + + actualPhase = testutil.ToFloat64(mocks.ctrl.recorder.GetPhaseMetric().WithLabelValues("podinfo", "default")) + assert.Equal(t, float64(7), actualPhase) // Succeeded }) t.Run("failed canary rollback with count metrics", func(t *testing.T) { @@ -270,6 +279,9 @@ func TestController_MetricsStateTransition(t *testing.T) { actualStatus := testutil.ToFloat64(mocks.ctrl.recorder.GetStatusMetric().WithLabelValues("podinfo", "default")) assert.Equal(t, float64(2), actualStatus) + actualPhase := testutil.ToFloat64(mocks.ctrl.recorder.GetPhaseMetric().WithLabelValues("podinfo", "default")) + assert.Equal(t, float64(8), actualPhase) // Failed + actualPrimaryWeight := testutil.ToFloat64(mocks.ctrl.recorder.GetWeightMetric().WithLabelValues("podinfo-primary", "default")) actualCanaryWeight := testutil.ToFloat64(mocks.ctrl.recorder.GetWeightMetric().WithLabelValues("podinfo", "default")) assert.Equal(t, float64(100), actualPrimaryWeight) diff --git a/pkg/metrics/recorder.go b/pkg/metrics/recorder.go index 85e4d6e0a..81f21b710 100644 --- a/pkg/metrics/recorder.go +++ b/pkg/metrics/recorder.go @@ -37,6 +37,24 @@ const ( AnalysisStatusSkipped = "skipped" ) +// canaryPhaseValues maps each canary phase to a unique integer value +// exposed by the flagger_canary_phase metric. Unlike flagger_canary_status +// (which collapses all phases into running/successful/failed), this mapping +// keeps every phase distinct so they can be rendered on a Grafana state-timeline. 
+var canaryPhaseValues = map[flaggerv1.CanaryPhase]float64{ + flaggerv1.CanaryPhaseInitializing: 0, + flaggerv1.CanaryPhaseInitialized: 1, + flaggerv1.CanaryPhaseWaiting: 2, + flaggerv1.CanaryPhaseProgressing: 3, + flaggerv1.CanaryPhaseWaitingPromotion: 4, + flaggerv1.CanaryPhasePromoting: 5, + flaggerv1.CanaryPhaseFinalising: 6, + flaggerv1.CanaryPhaseSucceeded: 7, + flaggerv1.CanaryPhaseFailed: 8, + flaggerv1.CanaryPhaseTerminating: 9, + flaggerv1.CanaryPhaseTerminated: 10, +} + // CanaryMetricLabels holds labels for canary metrics type CanaryMetricLabels struct { Name string @@ -56,6 +74,7 @@ type Recorder struct { duration *prometheus.HistogramVec total *prometheus.GaugeVec status *prometheus.GaugeVec + phase *prometheus.GaugeVec weight *prometheus.GaugeVec analysis *prometheus.GaugeVec successes *prometheus.CounterVec @@ -90,6 +109,14 @@ func NewRecorder(controller string, register bool) Recorder { Help: "Last canary analysis result", }, []string{"name", "namespace"}) + phase := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: controller, + Name: "canary_phase", + Help: "Current canary phase " + + "(0=Initializing, 1=Initialized, 2=Waiting, 3=Progressing, 4=WaitingPromotion, " + + "5=Promoting, 6=Finalising, 7=Succeeded, 8=Failed, 9=Terminating, 10=Terminated)", + }, []string{"name", "namespace"}) + weight := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Subsystem: controller, Name: "canary_weight", @@ -119,6 +146,7 @@ func NewRecorder(controller string, register bool) Recorder { prometheus.MustRegister(duration) prometheus.MustRegister(total) prometheus.MustRegister(status) + prometheus.MustRegister(phase) prometheus.MustRegister(weight) prometheus.MustRegister(analysis) prometheus.MustRegister(successes) @@ -130,6 +158,7 @@ func NewRecorder(controller string, register bool) Recorder { duration: duration, total: total, status: status, + phase: phase, weight: weight, analysis: analysis, successes: successes, @@ -168,6 +197,17 @@ func (cr *Recorder) 
SetStatus(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) status = 1 } cr.status.WithLabelValues(cd.Spec.TargetRef.Name, cd.Namespace).Set(float64(status)) + cr.SetPhase(cd, phase) +} + +// SetPhase sets the canary phase as a unique value per phase, see canaryPhaseValues. +// Unknown phases are ignored to avoid recording a misleading value. +func (cr *Recorder) SetPhase(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) { + value, ok := canaryPhaseValues[phase] + if !ok { + return + } + cr.phase.WithLabelValues(cd.Spec.TargetRef.Name, cd.Namespace).Set(value) } // SetWeight sets the weight values for primary and canary destinations @@ -191,6 +231,11 @@ func (cr *Recorder) GetStatusMetric() *prometheus.GaugeVec { return cr.status } +// GetPhaseMetric returns the phase metric +func (cr *Recorder) GetPhaseMetric() *prometheus.GaugeVec { + return cr.phase +} + // GetWeightMetric returns the weight metric func (cr *Recorder) GetWeightMetric() *prometheus.GaugeVec { return cr.weight diff --git a/pkg/metrics/recorder_test.go b/pkg/metrics/recorder_test.go index c98ff1231..478a8c37f 100644 --- a/pkg/metrics/recorder_test.go +++ b/pkg/metrics/recorder_test.go @@ -66,6 +66,30 @@ func TestRecorder_GetterMethodsWithData(t *testing.T) { expected: 1.0, checkValue: true, }, + { + name: "SetAndGetPhase", + setupFunc: func(r Recorder) { r.SetPhase(canary, flaggerv1.CanaryPhaseWaitingPromotion) }, + getterFunc: func(r Recorder) interface{} { return r.GetPhaseMetric() }, + labels: []string{"podinfo", "default"}, + expected: 4.0, + checkValue: true, + }, + { + name: "SetAndGetPhaseTerminating", + setupFunc: func(r Recorder) { r.SetPhase(canary, flaggerv1.CanaryPhaseTerminating) }, + getterFunc: func(r Recorder) interface{} { return r.GetPhaseMetric() }, + labels: []string{"podinfo", "default"}, + expected: 9.0, + checkValue: true, + }, + { + name: "SetAndGetPhaseTerminated", + setupFunc: func(r Recorder) { r.SetPhase(canary, flaggerv1.CanaryPhaseTerminated) }, + getterFunc: func(r 
Recorder) interface{} { return r.GetPhaseMetric() }, + labels: []string{"podinfo", "default"}, + expected: 10.0, + checkValue: true, + }, { name: "SetAndGetTotal", setupFunc: func(r Recorder) { r.SetTotal("default", 3) },