Skip to content

Commit e5fb7ab

Browse files
SinghVikram97vbedi
andauthored
Exponential backoff for reconciler requests (#822)
* exponential backoff for reconciler requests * fix typo in comments * fix: use static delay for target draining instead of exponential backoff to prevent delay in reconciliation --------- Co-authored-by: vbedi <vbedi@amazon.com>
1 parent 17be840 commit e5fb7ab

17 files changed

+122
-92
lines changed

pkg/controllers/accesslogpolicy_controller.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ package controllers
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"reflect"
2324

2425
"golang.org/x/exp/slices"
2526
corev1 "k8s.io/api/core/v1"
26-
"k8s.io/apimachinery/pkg/api/errors"
27+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2829
"k8s.io/apimachinery/pkg/runtime"
2930
"k8s.io/apimachinery/pkg/types"
@@ -116,8 +117,8 @@ func (r *accessLogPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Requ
116117
res, retryErr := lattice_runtime.HandleReconcileError(recErr)
117118
if res.RequeueAfter != 0 {
118119
r.log.Infow(ctx, "requeue request", "name", req.Name, "requeueAfter", res.RequeueAfter)
119-
} else if res.RequeueAfter == 0 && retryErr != nil {
120-
r.log.Infow(ctx, "requeue request", "name", req.Name)
120+
} else if retryErr != nil && !errors.Is(retryErr, reconcile.TerminalError(nil)) {
121+
r.log.Infow(ctx, "requeue request using exponential backoff", "name", req.Name)
121122
} else if retryErr == nil {
122123
r.log.Infow(ctx, "reconciled", "name", req.Name)
123124
}
@@ -250,7 +251,7 @@ func (r *accessLogPolicyReconciler) targetRefExists(ctx context.Context, alp *an
250251
return false, fmt.Errorf("access Log Policy targetRef is for unsupported Kind: %s", alp.Spec.TargetRef.Kind)
251252
}
252253

253-
if err != nil && !errors.IsNotFound(err) {
254+
if err != nil && !apierrors.IsNotFound(err) {
254255
return false, err
255256
}
256257

pkg/controllers/gateway_controller.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controllers
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223

2324
anv1alpha1 "github.com/aws/aws-application-networking-k8s/pkg/apis/applicationnetworking/v1alpha1"
@@ -32,13 +33,14 @@ import (
3233
"github.com/aws/aws-application-networking-k8s/pkg/utils"
3334
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
3435

35-
"github.com/pkg/errors"
36+
pkgerrors "github.com/pkg/errors"
3637
corev1 "k8s.io/api/core/v1"
3738
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3839
"k8s.io/apimachinery/pkg/runtime"
3940
"k8s.io/client-go/tools/record"
4041
ctrl "sigs.k8s.io/controller-runtime"
4142
"sigs.k8s.io/controller-runtime/pkg/client"
43+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
4244
gwv1 "sigs.k8s.io/gateway-api/apis/v1"
4345

4446
deploy "github.com/aws/aws-application-networking-k8s/pkg/deploy/lattice"
@@ -130,8 +132,8 @@ func (r *gatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
130132
res, retryErr := lattice_runtime.HandleReconcileError(recErr)
131133
if res.RequeueAfter != 0 {
132134
r.log.Infow(ctx, "requeue request", "name", req.Name, "requeueAfter", res.RequeueAfter)
133-
} else if res.RequeueAfter == 0 && retryErr != nil {
134-
r.log.Infow(ctx, "requeue request", "name", req.Name)
135+
} else if retryErr != nil && !errors.Is(retryErr, reconcile.TerminalError(nil)) {
136+
r.log.Infow(ctx, "requeue request using exponential backoff", "name", req.Name)
135137
} else if retryErr == nil {
136138
r.log.Infow(ctx, "reconciled", "name", req.Name)
137139
}
@@ -194,7 +196,7 @@ func (r *gatewayReconciler) reconcileUpsert(ctx context.Context, gw *gwv1.Gatewa
194196
if err != nil {
195197
err2 := r.updateGatewayAcceptStatus(ctx, gw, false)
196198
if err2 != nil {
197-
return errors.Wrap(err2, err.Error())
199+
return pkgerrors.Wrap(err2, err.Error())
198200
}
199201
}
200202

@@ -420,7 +422,7 @@ func UpdateGWListenerStatus(ctx context.Context, k8sClient client.Client, gw *gw
420422
}
421423

422424
if err := k8sClient.Status().Patch(ctx, gw, client.MergeFrom(gwOld)); err != nil {
423-
return errors.Wrapf(err, "listener update failed")
425+
return pkgerrors.Wrapf(err, "listener update failed")
424426
}
425427

426428
if hasValidListener {

pkg/controllers/route_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ import (
4646
"github.com/aws/aws-application-networking-k8s/pkg/controllers/eventhandlers"
4747
"github.com/aws/aws-application-networking-k8s/pkg/controllers/predicates"
4848
"github.com/aws/aws-application-networking-k8s/pkg/deploy"
49-
"github.com/aws/aws-application-networking-k8s/pkg/deploy/lattice"
5049
"github.com/aws/aws-application-networking-k8s/pkg/gateway"
5150
"github.com/aws/aws-application-networking-k8s/pkg/k8s"
5251
"github.com/aws/aws-application-networking-k8s/pkg/model/core"
@@ -262,7 +261,8 @@ func (r *routeReconciler) buildAndDeployModel(
262261
r.log.Debugf(ctx, "stack: %s", json)
263262

264263
if err := r.stackDeployer.Deploy(ctx, stack); err != nil {
265-
if errors.As(err, &lattice.RetryErr) {
264+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
265+
if errors.As(err, &requeueNeededAfter) {
266266
r.eventRecorder.Event(route.K8sObject(), corev1.EventTypeNormal,
267267
k8s.RouteEventReasonRetryReconcile, "retry reconcile...")
268268
} else {

pkg/controllers/serviceexport_controller.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"k8s.io/client-go/tools/record"
2828
ctrl "sigs.k8s.io/controller-runtime"
2929
"sigs.k8s.io/controller-runtime/pkg/client"
30+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3031

3132
anv1alpha1 "github.com/aws/aws-application-networking-k8s/pkg/apis/applicationnetworking/v1alpha1"
3233
"github.com/aws/aws-application-networking-k8s/pkg/aws"
@@ -111,7 +112,16 @@ func (r *serviceExportReconciler) Reconcile(ctx context.Context, req ctrl.Reques
111112
if recErr != nil {
112113
r.log.Infow(ctx, "reconcile error", "name", req.Name, "message", recErr.Error())
113114
}
114-
return lattice_runtime.HandleReconcileError(recErr)
115+
116+
res, retryErr := lattice_runtime.HandleReconcileError(recErr)
117+
if res.RequeueAfter != 0 {
118+
r.log.Infow(ctx, "requeue request", "name", req.Name, "requeueAfter", res.RequeueAfter)
119+
} else if retryErr != nil && !errors.Is(retryErr, reconcile.TerminalError(nil)) {
120+
r.log.Infow(ctx, "requeue request using exponential backoff", "name", req.Name)
121+
} else if retryErr == nil {
122+
r.log.Infow(ctx, "reconciled", "name", req.Name)
123+
}
124+
return res, retryErr
115125
}
116126

117127
func (r *serviceExportReconciler) reconcile(ctx context.Context, req ctrl.Request) error {

pkg/deploy/lattice/error.go

Lines changed: 0 additions & 9 deletions
This file was deleted.

pkg/deploy/lattice/service_manager.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66

77
"github.com/aws/aws-application-networking-k8s/pkg/aws/services"
8+
lattice_runtime "github.com/aws/aws-application-networking-k8s/pkg/runtime"
89
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
910

1011
"github.com/aws/aws-sdk-go/aws"
@@ -251,12 +252,12 @@ func (m *defaultServiceManager) updateAssociations(ctx context.Context, svc *Ser
251252
return nil
252253
}
253254

254-
// returns RetryErr on all non-active Sn-Svc association responses
255+
// returns RetryError on all non-active Sn-Svc association responses
255256
func handleCreateAssociationResp(resp *CreateSnSvcAssocResp) error {
256257
status := aws.StringValue(resp.Status)
257258
if status != vpclattice.ServiceNetworkServiceAssociationStatusActive {
258259
return fmt.Errorf("%w: sn-service-association-id: %s, non-active status: %s",
259-
RetryErr, aws.StringValue(resp.Id), status)
260+
lattice_runtime.NewRetryError(), aws.StringValue(resp.Id), status)
260261
}
261262
return nil
262263
}
@@ -288,7 +289,7 @@ func associationsDiff(svc *Service, curAssocs []*SnSvcAssocSummary) ([]string, [
288289
// TODO: we should have something more lightweight, retrying full reconciliation looks to heavy
289290
if aws.StringValue(oldSn.Status) == vpclattice.ServiceNetworkServiceAssociationStatusDeleteInProgress {
290291
return nil, nil, fmt.Errorf("%w: want to associate sn: %s to svc: %s, but status is: %s",
291-
RetryErr, newSn, svc.LatticeServiceName(), *oldSn.Status)
292+
lattice_runtime.NewRetryError(), newSn, svc.LatticeServiceName(), *oldSn.Status)
292293
}
293294
// TODO: if assoc in failed state, may be we should try to re-create?
294295
}

pkg/deploy/lattice/service_manager_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
mocks "github.com/aws/aws-application-networking-k8s/pkg/aws/services"
1010
"github.com/aws/aws-application-networking-k8s/pkg/model/core"
1111
model "github.com/aws/aws-application-networking-k8s/pkg/model/lattice"
12+
lattice_runtime "github.com/aws/aws-application-networking-k8s/pkg/runtime"
1213
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
1314
"github.com/aws/aws-sdk-go/aws"
1415
"github.com/aws/aws-sdk-go/service/vpclattice"
@@ -508,7 +509,8 @@ func TestHandleSnSvcAssocResp(t *testing.T) {
508509
Status: aws.String(vpclattice.ServiceNetworkServiceAssociationStatusCreateInProgress),
509510
}
510511
err := handleCreateAssociationResp(resp)
511-
assert.True(t, errors.Is(err, RetryErr))
512+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
513+
assert.True(t, errors.As(err, &requeueNeededAfter))
512514
})
513515

514516
}
@@ -569,7 +571,8 @@ func TestSnSvcAssocsDiff(t *testing.T) {
569571
Status: aws.String(vpclattice.ServiceNetworkServiceAssociationStatusDeleteInProgress),
570572
}}
571573
_, _, err := associationsDiff(svc, assocs)
572-
assert.True(t, errors.Is(err, RetryErr))
574+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
575+
assert.True(t, errors.As(err, &requeueNeededAfter))
573576
})
574577

575578
}

pkg/deploy/lattice/service_network_manager.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ package lattice
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
76

87
"golang.org/x/exp/slices"
98

109
"github.com/aws/aws-application-networking-k8s/pkg/aws/services"
10+
lattice_runtime "github.com/aws/aws-application-networking-k8s/pkg/runtime"
1111
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
1212

1313
"github.com/aws/aws-sdk-go/aws"
@@ -80,7 +80,7 @@ func (m *defaultServiceNetworkManager) UpsertVpcAssociation(ctx context.Context,
8080
case vpclattice.ServiceNetworkVpcAssociationStatusActive:
8181
return *resp.Arn, nil
8282
default:
83-
return *resp.Arn, fmt.Errorf("%w, vpc association status in %s", RetryErr, status)
83+
return *resp.Arn, fmt.Errorf("%w, vpc association status in %s", lattice_runtime.NewRetryError(), status)
8484
}
8585
}
8686
}
@@ -125,7 +125,7 @@ func (m *defaultServiceNetworkManager) DeleteVpcAssociation(ctx context.Context,
125125
if err != nil {
126126
m.log.Infof(ctx, "Failed to delete association %s for %s, with response %s and err %s", *snva.Arn, snName, resp, err.Error())
127127
}
128-
return errors.New(LATTICE_RETRY)
128+
return lattice_runtime.NewRetryError()
129129
}
130130
return nil
131131
}
@@ -163,7 +163,7 @@ func (m *defaultServiceNetworkManager) getActiveVpcAssociation(ctx context.Conte
163163
return nil, nil
164164
default:
165165
// a mutation is in progress, try later
166-
return nil, errors.New(LATTICE_RETRY)
166+
return nil, lattice_runtime.NewRetryError()
167167
}
168168
}
169169

@@ -253,7 +253,7 @@ func (m *defaultServiceNetworkManager) updateServiceNetworkVpcAssociation(ctx co
253253
SnvaSecurityGroupIds: updateSnvaResp.SecurityGroupIds,
254254
}, nil
255255
} else {
256-
return model.ServiceNetworkStatus{}, fmt.Errorf("%w, update snva status: %s", RetryErr, *updateSnvaResp.Status)
256+
return model.ServiceNetworkStatus{}, fmt.Errorf("%w, update snva status: %s", lattice_runtime.NewRetryError(), *updateSnvaResp.Status)
257257
}
258258
}
259259

pkg/deploy/lattice/service_network_manager_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/stretchr/testify/assert"
1212

1313
"github.com/aws/aws-application-networking-k8s/pkg/config"
14+
lattice_runtime "github.com/aws/aws-application-networking-k8s/pkg/runtime"
1415
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
1516

1617
pkg_aws "github.com/aws/aws-application-networking-k8s/pkg/aws"
@@ -147,7 +148,8 @@ func Test_CreateOrUpdateServiceNetwork_SnAlreadyExist_ServiceNetworkVpcAssociati
147148
resp, err := snMgr.CreateOrUpdate(ctx, &snCreateInput)
148149

149150
assert.NotNil(t, err)
150-
assert.Equal(t, err, errors.New(LATTICE_RETRY))
151+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
152+
assert.True(t, errors.As(err, &requeueNeededAfter))
151153
assert.Equal(t, resp.ServiceNetworkARN, "")
152154
assert.Equal(t, resp.ServiceNetworkID, "")
153155
}
@@ -199,7 +201,8 @@ func Test_CreateOrUpdateServiceNetwork_SnAlreadyExist_ServiceNetworkVpcAssociati
199201
resp, err := snMgr.CreateOrUpdate(ctx, &snCreateInput)
200202

201203
assert.NotNil(t, err)
202-
assert.Equal(t, err, errors.New(LATTICE_RETRY))
204+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
205+
assert.True(t, errors.As(err, &requeueNeededAfter))
203206
assert.Equal(t, resp.ServiceNetworkARN, "")
204207
assert.Equal(t, resp.ServiceNetworkID, "")
205208
}
@@ -629,7 +632,8 @@ func Test_defaultServiceNetworkManager_CreateOrUpdate_SnExists_SnvaCreateInProgr
629632
snMgr := NewDefaultServiceNetworkManager(gwlog.FallbackLogger, cloud)
630633
_, err := snMgr.UpsertVpcAssociation(ctx, name, securityGroupIds)
631634

632-
assert.Equal(t, err, errors.New(LATTICE_RETRY))
635+
var requeueNeededAfter *lattice_runtime.RequeueNeededAfter
636+
assert.True(t, errors.As(err, &requeueNeededAfter))
633637
}
634638

635639
func Test_defaultServiceNetworkManager_CreateOrUpdate_SnExists_SnvaExists_CannotUpdateSecurityGroupsFromNonemptyToEmpty(t *testing.T) {

pkg/deploy/lattice/target_group_manager.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717

1818
pkg_aws "github.com/aws/aws-application-networking-k8s/pkg/aws"
1919
model "github.com/aws/aws-application-networking-k8s/pkg/model/lattice"
20+
lattice_runtime "github.com/aws/aws-application-networking-k8s/pkg/runtime"
2021
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
2122
)
2223

@@ -115,7 +116,7 @@ func (s *defaultTargetGroupManager) create(ctx context.Context, modelTg *model.T
115116
latticeTgStatus != vpclattice.TargetGroupStatusCreateInProgress {
116117

117118
s.log.Infof(ctx, "Target group is not in the desired state. State is %s, will retry", latticeTgStatus)
118-
return model.TargetGroupStatus{}, errors.New(LATTICE_RETRY)
119+
return model.TargetGroupStatus{}, lattice_runtime.NewRetryError()
119120
}
120121

121122
// create-in-progress is considered success
@@ -219,7 +220,7 @@ func (s *defaultTargetGroupManager) Delete(ctx context.Context, modelTg *model.T
219220

220221
if drainCount > 0 {
221222
// no point in trying to deregister may as well wait
222-
return fmt.Errorf("cannot deregister targets for %s as %d targets are DRAINING", modelTg.Status.Id, drainCount)
223+
return fmt.Errorf("%w: cannot deregister targets for %s as %d targets are DRAINING", lattice_runtime.NewRetryError(), modelTg.Status.Id, drainCount)
223224
}
224225

225226
if len(targetsToDeregister) > 0 {
@@ -341,7 +342,7 @@ func (s *defaultTargetGroupManager) findTargetGroup(
341342
if match {
342343
switch status {
343344
case vpclattice.TargetGroupStatusCreateInProgress, vpclattice.TargetGroupStatusDeleteInProgress:
344-
return nil, errors.New(LATTICE_RETRY)
345+
return nil, lattice_runtime.NewRetryError()
345346
case vpclattice.TargetGroupStatusDeleteFailed, vpclattice.TargetGroupStatusActive:
346347
return latticeTg, nil
347348
}

0 commit comments

Comments
 (0)