diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 12a94befc..15904a3a4 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -297,7 +297,14 @@ func (c *Controller) advanceCanary(name string, namespace string) { if err != nil { c.recordEventWarningf(cd, "%v", err) if !retriable { - c.rollback(cd, canaryController, meshRouter, scalerReconciler) + // during promotion the canary is the only healthy copy, halt + // instead of rolling back traffic to the unhealthy primary + if cd.Status.Phase == flaggerv1.CanaryPhasePromoting || + cd.Status.Phase == flaggerv1.CanaryPhaseFinalising { + c.promotionFailed(cd, canaryController, err) + } else { + c.rollback(cd, canaryController, meshRouter, scalerReconciler) + } } return } @@ -989,6 +996,29 @@ func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary. c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed) } +// promotionFailed marks the rollout as failed when the primary is unhealthy +// during promotion, without scaling the canary down or routing to the primary. +func (c *Controller) promotionFailed(canary *flaggerv1.Canary, canaryController canary.Controller, err error) { + c.recordEventWarningf(canary, "Promotion of %s.%s failed, primary not ready: %v", + canary.Spec.TargetRef.Name, canary.Namespace, err) + c.alert(canary, fmt.Sprintf("Promotion failed, primary not ready: %v", err), + false, flaggerv1.SeverityError) + + if err := canaryController.SetStatusPhase(canary, flaggerv1.CanaryPhaseFailed); err != nil { + c.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)).Errorf("%v", err) + return + } + + c.recorder.SetStatus(canary, flaggerv1.CanaryPhaseFailed) + c.recorder.IncFailures(metrics.CanaryMetricLabels{ + Name: canary.Spec.TargetRef.Name, + Namespace: canary.Namespace, + DeploymentStrategy: canary.DeploymentStrategy(), + AnalysisStatus: metrics.AnalysisStatusCompleted, + }) + c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed) +} + func (c *Controller) setPhaseInitialized(cd *flaggerv1.Canary, canaryController canary.Controller) error { if cd.Status.Phase == "" || cd.Status.Phase == flaggerv1.CanaryPhaseInitializing { cd.Status.Phase = flaggerv1.CanaryPhaseInitialized diff --git a/pkg/controller/scheduler_deployment_fixture_test.go b/pkg/controller/scheduler_deployment_fixture_test.go index a4e046224..4174737bf 100644 --- a/pkg/controller/scheduler_deployment_fixture_test.go +++ b/pkg/controller/scheduler_deployment_fixture_test.go @@ -82,6 +82,39 @@ func (f fixture) makeReady(t *testing.T, name string) { require.NoError(t, err) } +// makePrimaryNotReady puts the primary into a stuck rollout (progress deadline exceeded). +func (f fixture) makePrimaryNotReady(t *testing.T) { + primaryName := fmt.Sprintf("%s-primary", f.canary.Spec.TargetRef.Name) + p, err := f.kubeClient.AppsV1(). + Deployments("default"). + Get(context.TODO(), primaryName, metav1.GetOptions{}) + require.NoError(t, err) + + p.Status = appsv1.DeploymentStatus{ + ObservedGeneration: p.Generation, + Replicas: 1, + UpdatedReplicas: 1, + ReadyReplicas: 0, + AvailableReplicas: 0, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressDeadlineExceeded", + }, + { + Type: appsv1.DeploymentAvailable, + Status: corev1.ConditionFalse, + Reason: "MinimumReplicasUnavailable", + LastUpdateTime: metav1.Now(), + }, + }, + } + + _, err = f.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), p, metav1.UpdateOptions{}) + require.NoError(t, err) +} + func newDeploymentFixture(c *flaggerv1.Canary) fixture { if c == nil { c = newDeploymentTestCanary() diff --git a/pkg/controller/scheduler_deployment_test.go b/pkg/controller/scheduler_deployment_test.go index 68ec7a0db..9e7c57040 100644 --- a/pkg/controller/scheduler_deployment_test.go +++ b/pkg/controller/scheduler_deployment_test.go @@ -116,6 +116,67 @@ func TestScheduler_DeploymentRollback(t *testing.T) { assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase) } +// when the primary fails to become ready during promotion, the healthy canary +// must be kept instead of rolled back to the broken primary (#1898) +func TestScheduler_DeploymentPromotionPrimaryNotReady(t *testing.T) { + mocks := newDeploymentFixture(nil) + + // initializing + mocks.ctrl.advanceCanary("podinfo", "default") + mocks.makePrimaryReady(t) + + // initialized + mocks.ctrl.advanceCanary("podinfo", "default") + + // update + dep2 := newDeploymentTestDeploymentV2() + _, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{}) + require.NoError(t, err) + + // detect changes -> progressing, canary scaled up + mocks.ctrl.advanceCanary("podinfo", "default") + mocks.makeCanaryReady(t) + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing)) + + canaryDep, err := mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + require.NotNil(t, canaryDep.Spec.Replicas) + canaryReplicas := *canaryDep.Spec.Replicas + require.Greater(t, canaryReplicas, int32(0)) + + // simulate: analysis succeeded, spec promoted to primary, now finishing promotion + cd, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + err = mocks.deployer.SetStatusPhase(cd, flaggerv1.CanaryPhasePromoting) + require.NoError(t, err) + + // known routing state at promotion time (split between primary and canary) + require.NoError(t, mocks.router.SetRoutes(mocks.canary, 50, 50, false)) + + // the promoted primary fails to roll out + mocks.makePrimaryNotReady(t) + + // advance: Flagger observes the primary is stuck + mocks.ctrl.advanceCanary("podinfo", "default") + + // the canary must NOT be scaled to zero - it is the only healthy copy serving traffic + canaryDep, err = mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + require.NotNil(t, canaryDep.Spec.Replicas) + assert.Equal(t, canaryReplicas, *canaryDep.Spec.Replicas, "canary must not be scaled to zero when promotion fails") + + // traffic must NOT be shifted entirely onto the broken primary + primaryWeight, canaryWeight, _, err := mocks.router.GetRoutes(mocks.canary) + require.NoError(t, err) + assert.NotEqual(t, 100, primaryWeight, "traffic must not be routed entirely to the unhealthy primary") + assert.Greater(t, canaryWeight, 0, "canary must keep serving traffic when promotion fails") + + // the rollout is reported as failed so it stops advancing and alerts + c, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase) +} + func TestScheduler_DeploymentSkipAnalysis(t *testing.T) { mocks := newDeploymentFixture(nil) // initializing