From 120c187938b766a99ab783e522d4b1308fe63ddd Mon Sep 17 00:00:00 2001 From: Pedram Pourmohammad Date: Fri, 12 Jun 2026 00:15:01 +0330 Subject: [PATCH] Keep canary alive when primary promotion fails When the canary analysis succeeds, Flagger copies the canary pod spec to the primary and waits for the primary rollout to finish. If the primary fails to become ready, the non-retriable readiness error triggered the standard analysis rollback, which routes all traffic to the primary and scales the canary to zero. During promotion the primary already runs the new (failing) spec while the canary is the only healthy copy of the new revision still serving traffic. Rolling back therefore sends all traffic to the broken primary and deletes the working canary, taking the application down. Halt the promotion instead: when the primary is not ready and the canary is in the Promoting or Finalising phase, mark the rollout as failed and alert, but keep the canary running and leave routing untouched until the primary recovers or a corrected revision is applied. Fixes #1898 Signed-off-by: Pedram Pourmohammad --- pkg/controller/scheduler.go | 32 +++++++++- .../scheduler_deployment_fixture_test.go | 33 ++++++++++ pkg/controller/scheduler_deployment_test.go | 61 +++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 12a94befc..15904a3a4 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -297,7 +297,14 @@ func (c *Controller) advanceCanary(name string, namespace string) { if err != nil { c.recordEventWarningf(cd, "%v", err) if !retriable { - c.rollback(cd, canaryController, meshRouter, scalerReconciler) + // during promotion the canary is the only healthy copy, halt + // instead of rolling back traffic to the unhealthy primary + if cd.Status.Phase == flaggerv1.CanaryPhasePromoting || + cd.Status.Phase == flaggerv1.CanaryPhaseFinalising { + c.promotionFailed(cd, canaryController, err) + } else { + c.rollback(cd, canaryController, meshRouter, scalerReconciler) + } } return } @@ -989,6 +996,29 @@ func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary. c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed) } +// promotionFailed marks the rollout as failed when the primary is unhealthy +// during promotion, without scaling the canary down or routing to the primary. +func (c *Controller) promotionFailed(canary *flaggerv1.Canary, canaryController canary.Controller, err error) { + c.recordEventWarningf(canary, "Promotion of %s.%s failed, primary not ready: %v", + canary.Spec.TargetRef.Name, canary.Namespace, err) + c.alert(canary, fmt.Sprintf("Promotion failed, primary not ready: %v", err), + false, flaggerv1.SeverityError) + + if err := canaryController.SetStatusPhase(canary, flaggerv1.CanaryPhaseFailed); err != nil { + c.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)).Errorf("%v", err) + return + } + + c.recorder.SetStatus(canary, flaggerv1.CanaryPhaseFailed) + c.recorder.IncFailures(metrics.CanaryMetricLabels{ + Name: canary.Spec.TargetRef.Name, + Namespace: canary.Namespace, + DeploymentStrategy: canary.DeploymentStrategy(), + AnalysisStatus: metrics.AnalysisStatusCompleted, + }) + c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed) +} + func (c *Controller) setPhaseInitialized(cd *flaggerv1.Canary, canaryController canary.Controller) error { if cd.Status.Phase == "" || cd.Status.Phase == flaggerv1.CanaryPhaseInitializing { cd.Status.Phase = flaggerv1.CanaryPhaseInitialized diff --git a/pkg/controller/scheduler_deployment_fixture_test.go b/pkg/controller/scheduler_deployment_fixture_test.go index a4e046224..4174737bf 100644 --- a/pkg/controller/scheduler_deployment_fixture_test.go +++ b/pkg/controller/scheduler_deployment_fixture_test.go @@ -82,6 +82,39 @@ func (f fixture) makeReady(t *testing.T, name string) { require.NoError(t, err) } +// makePrimaryNotReady puts the primary into a stuck rollout (progress deadline exceeded). +func (f fixture) makePrimaryNotReady(t *testing.T) { + primaryName := fmt.Sprintf("%s-primary", f.canary.Spec.TargetRef.Name) + p, err := f.kubeClient.AppsV1(). + Deployments("default"). + Get(context.TODO(), primaryName, metav1.GetOptions{}) + require.NoError(t, err) + + p.Status = appsv1.DeploymentStatus{ + ObservedGeneration: p.Generation, + Replicas: 1, + UpdatedReplicas: 1, + ReadyReplicas: 0, + AvailableReplicas: 0, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressDeadlineExceeded", + }, + { + Type: appsv1.DeploymentAvailable, + Status: corev1.ConditionFalse, + Reason: "MinimumReplicasUnavailable", + LastUpdateTime: metav1.Now(), + }, + }, + } + + _, err = f.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), p, metav1.UpdateOptions{}) + require.NoError(t, err) +} + func newDeploymentFixture(c *flaggerv1.Canary) fixture { if c == nil { c = newDeploymentTestCanary() diff --git a/pkg/controller/scheduler_deployment_test.go b/pkg/controller/scheduler_deployment_test.go index 68ec7a0db..9e7c57040 100644 --- a/pkg/controller/scheduler_deployment_test.go +++ b/pkg/controller/scheduler_deployment_test.go @@ -116,6 +116,67 @@ func TestScheduler_DeploymentRollback(t *testing.T) { assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase) } +// when the primary fails to become ready during promotion, the healthy canary +// must be kept instead of rolled back to the broken primary (#1898) +func TestScheduler_DeploymentPromotionPrimaryNotReady(t *testing.T) { + mocks := newDeploymentFixture(nil) + + // initializing + mocks.ctrl.advanceCanary("podinfo", "default") + mocks.makePrimaryReady(t) + + // initialized + mocks.ctrl.advanceCanary("podinfo", "default") + + // update + dep2 := newDeploymentTestDeploymentV2() + _, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{}) + require.NoError(t, err) + + // detect changes -> progressing, canary scaled up + mocks.ctrl.advanceCanary("podinfo", "default") + mocks.makeCanaryReady(t) + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing)) + + canaryDep, err := mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + require.NotNil(t, canaryDep.Spec.Replicas) + canaryReplicas := *canaryDep.Spec.Replicas + require.Greater(t, canaryReplicas, int32(0)) + + // simulate: analysis succeeded, spec promoted to primary, now finishing promotion + cd, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + err = mocks.deployer.SetStatusPhase(cd, flaggerv1.CanaryPhasePromoting) + require.NoError(t, err) + + // known routing state at promotion time (split between primary and canary) + require.NoError(t, mocks.router.SetRoutes(mocks.canary, 50, 50, false)) + + // the promoted primary fails to roll out + mocks.makePrimaryNotReady(t) + + // advance: Flagger observes the primary is stuck + mocks.ctrl.advanceCanary("podinfo", "default") + + // the canary must NOT be scaled to zero - it is the only healthy copy serving traffic + canaryDep, err = mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + require.NotNil(t, canaryDep.Spec.Replicas) + assert.Equal(t, canaryReplicas, *canaryDep.Spec.Replicas, "canary must not be scaled to zero when promotion fails") + + // traffic must NOT be shifted entirely onto the broken primary + primaryWeight, canaryWeight, _, err := mocks.router.GetRoutes(mocks.canary) + require.NoError(t, err) + assert.NotEqual(t, 100, primaryWeight, "traffic must not be routed entirely to the unhealthy primary") + assert.Greater(t, canaryWeight, 0, "canary must keep serving traffic when promotion fails") + + // the rollout is reported as failed so it stops advancing and alerts + c, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase) +} + func TestScheduler_DeploymentSkipAnalysis(t *testing.T) { mocks := newDeploymentFixture(nil) // initializing