Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,14 @@ func (c *Controller) advanceCanary(name string, namespace string) {
if err != nil {
c.recordEventWarningf(cd, "%v", err)
if !retriable {
c.rollback(cd, canaryController, meshRouter, scalerReconciler)
// during promotion the canary is the only healthy copy, halt
// instead of rolling back traffic to the unhealthy primary
if cd.Status.Phase == flaggerv1.CanaryPhasePromoting ||
cd.Status.Phase == flaggerv1.CanaryPhaseFinalising {
c.promotionFailed(cd, canaryController, err)
} else {
c.rollback(cd, canaryController, meshRouter, scalerReconciler)
}
}
return
}
Expand Down Expand Up @@ -989,6 +996,29 @@ func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary.
c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed)
}

// promotionFailed marks the rollout as failed when the primary is unhealthy
// during promotion, without scaling the canary down or routing to the primary.
func (c *Controller) promotionFailed(canary *flaggerv1.Canary, canaryController canary.Controller, err error) {
c.recordEventWarningf(canary, "Promotion of %s.%s failed, primary not ready: %v",
canary.Spec.TargetRef.Name, canary.Namespace, err)
c.alert(canary, fmt.Sprintf("Promotion failed, primary not ready: %v", err),
false, flaggerv1.SeverityError)

if err := canaryController.SetStatusPhase(canary, flaggerv1.CanaryPhaseFailed); err != nil {
c.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)).Errorf("%v", err)
return
}

c.recorder.SetStatus(canary, flaggerv1.CanaryPhaseFailed)
c.recorder.IncFailures(metrics.CanaryMetricLabels{
Name: canary.Spec.TargetRef.Name,
Namespace: canary.Namespace,
DeploymentStrategy: canary.DeploymentStrategy(),
AnalysisStatus: metrics.AnalysisStatusCompleted,
})
c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed)
}

func (c *Controller) setPhaseInitialized(cd *flaggerv1.Canary, canaryController canary.Controller) error {
if cd.Status.Phase == "" || cd.Status.Phase == flaggerv1.CanaryPhaseInitializing {
cd.Status.Phase = flaggerv1.CanaryPhaseInitialized
Expand Down
33 changes: 33 additions & 0 deletions pkg/controller/scheduler_deployment_fixture_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,39 @@ func (f fixture) makeReady(t *testing.T, name string) {
require.NoError(t, err)
}

// makePrimaryNotReady puts the primary into a stuck rollout (progress deadline exceeded).
func (f fixture) makePrimaryNotReady(t *testing.T) {
primaryName := fmt.Sprintf("%s-primary", f.canary.Spec.TargetRef.Name)
p, err := f.kubeClient.AppsV1().
Deployments("default").
Get(context.TODO(), primaryName, metav1.GetOptions{})
require.NoError(t, err)

p.Status = appsv1.DeploymentStatus{
ObservedGeneration: p.Generation,
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: 0,
AvailableReplicas: 0,
Conditions: []appsv1.DeploymentCondition{
{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressDeadlineExceeded",
},
{
Type: appsv1.DeploymentAvailable,
Status: corev1.ConditionFalse,
Reason: "MinimumReplicasUnavailable",
LastUpdateTime: metav1.Now(),
},
},
}

_, err = f.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), p, metav1.UpdateOptions{})
require.NoError(t, err)
}

func newDeploymentFixture(c *flaggerv1.Canary) fixture {
if c == nil {
c = newDeploymentTestCanary()
Expand Down
61 changes: 61 additions & 0 deletions pkg/controller/scheduler_deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,67 @@ func TestScheduler_DeploymentRollback(t *testing.T) {
assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase)
}

// when the primary fails to become ready during promotion, the healthy canary
// must be kept instead of rolled back to the broken primary (#1898)
func TestScheduler_DeploymentPromotionPrimaryNotReady(t *testing.T) {
mocks := newDeploymentFixture(nil)

// initializing
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makePrimaryReady(t)

// initialized
mocks.ctrl.advanceCanary("podinfo", "default")

// update
dep2 := newDeploymentTestDeploymentV2()
_, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{})
require.NoError(t, err)

// detect changes -> progressing, canary scaled up
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makeCanaryReady(t)
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing))

canaryDep, err := mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
canaryReplicas := *canaryDep.Spec.Replicas
require.Greater(t, canaryReplicas, int32(0))

// simulate: analysis succeeded, spec promoted to primary, now finishing promotion
cd, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
err = mocks.deployer.SetStatusPhase(cd, flaggerv1.CanaryPhasePromoting)
require.NoError(t, err)

// known routing state at promotion time (split between primary and canary)
require.NoError(t, mocks.router.SetRoutes(mocks.canary, 50, 50, false))

// the promoted primary fails to roll out
mocks.makePrimaryNotReady(t)

// advance: Flagger observes the primary is stuck
mocks.ctrl.advanceCanary("podinfo", "default")

// the canary must NOT be scaled to zero - it is the only healthy copy serving traffic
canaryDep, err = mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
assert.Equal(t, canaryReplicas, *canaryDep.Spec.Replicas, "canary must not be scaled to zero when promotion fails")

// traffic must NOT be shifted entirely onto the broken primary
primaryWeight, canaryWeight, _, err := mocks.router.GetRoutes(mocks.canary)
require.NoError(t, err)
assert.NotEqual(t, 100, primaryWeight, "traffic must not be routed entirely to the unhealthy primary")
assert.Greater(t, canaryWeight, 0, "canary must keep serving traffic when promotion fails")

// the rollout is reported as failed so it stops advancing and alerts
c, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase)
}

func TestScheduler_DeploymentSkipAnalysis(t *testing.T) {
mocks := newDeploymentFixture(nil)
// initializing
Expand Down