Skip to content

Commit ddc5de4

Browse files
committed
fix: decouple connection retry backoff from TCP dial timeout
KEDA_HTTP_CONNECT_TIMEOUT was incorrectly used as both the TCP dial timeout and the initial retry backoff duration. This caused cold start response times to scale linearly with the timeout value (e.g., 500ms timeout → 3.5s response, 5s timeout → 7.8s response). Changes: - Use fixed 50ms initial backoff duration for cold start polling - Adopt Knatives proven backoff strategy (factor 1.4, jitter 0.1) - Fix MinTotalBackoffDuration to calculate exponential (not linear) sum - Improve dial retry test coverage Signed-off-by: Vincent Link <vlink@redhat.com>
1 parent 72e2b36 commit ddc5de4

File tree

5 files changed

+114
-59
lines changed

5 files changed

+114
-59
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ This changelog keeps track of work items that have been completed and are ready
3535

3636
### Fixes
3737

38-
- **General**: TODO ([#TODO](https://github.com/kedacore/http-add-on/issues/TODO))
38+
- **Interceptor**: Decouple connection retry backoff from TCP dial timeout for faster cold starts ([#1385](https://github.com/kedacore/http-add-on/issues/1385))
3939

4040
### Deprecations
4141

interceptor/config/timeouts.go

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,22 +36,18 @@ type Timeouts struct {
3636
ExpectContinueTimeout time.Duration `envconfig:"KEDA_HTTP_EXPECT_CONTINUE_TIMEOUT" default:"1s"`
3737
}
3838

39-
// Backoff returns a wait.Backoff based on the timeouts in t
40-
func (t *Timeouts) Backoff(factor, jitter float64, steps int) wait.Backoff {
39+
// DefaultBackoff returns backoff config optimized for cold start pod availability polling.
40+
// Based on https://github.com/knative/networking/blob/main/test/conformance/ingress/util.go#L70
41+
func (t Timeouts) DefaultBackoff() wait.Backoff {
4142
return wait.Backoff{
42-
Duration: t.Connect,
43-
Factor: factor,
44-
Jitter: jitter,
45-
Steps: steps,
43+
Cap: 10 * time.Second, // max sleep time per step
44+
Duration: 50 * time.Millisecond,
45+
Factor: 1.4,
46+
Jitter: 0.1, // adds up to +10% randomness
47+
Steps: 10,
4648
}
4749
}
4850

49-
// DefaultBackoff calls t.Backoff with reasonable defaults and returns
50-
// the result
51-
func (t Timeouts) DefaultBackoff() wait.Backoff {
52-
return t.Backoff(2, 0.5, 5)
53-
}
54-
5551
// Parse parses standard configs using envconfig and returns a pointer to the
5652
// newly created config. Returns nil and a non-nil error if parsing failed
5753
func MustParseTimeouts() *Timeouts {

interceptor/handler/upstream_test.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -298,8 +298,7 @@ func TestForwarderHeaderTimeout(t *testing.T) {
298298
timeouts := defaultTimeouts()
299299
timeouts.Connect = 1 * time.Millisecond
300300
timeouts.ResponseHeader = 1 * time.Millisecond
301-
backoff := timeouts.Backoff(2, 2, 1)
302-
dialCtxFunc := retryDialContextFunc(timeouts, backoff)
301+
dialCtxFunc := retryDialContextFunc(timeouts, timeouts.DefaultBackoff())
303302
res, req, err := reqAndRes("/testfwd")
304303
r.NoError(err)
305304
req = util.RequestWithStream(req, originURL)
@@ -387,8 +386,7 @@ func TestForwarderConnectionRetryAndTimeout(t *testing.T) {
387386
// forwardDoneSignal should close _after_ the total timeout of forwardRequest.
388387
//
389388
// forwardRequest uses dialCtxFunc to establish network connections, and dialCtxFunc does
390-
// exponential backoff. It starts at 2ms (timeouts.Connect above), doubles every time, and stops after 5 tries,
391-
// so that's 2ms + 4ms + 8ms + 16ms + 32ms, or SUM(2^N) where N is in [1, 5]
389+
// exponential backoff.
392390
expectedForwardTimeout := kedanet.MinTotalBackoffDuration(timeouts.DefaultBackoff())
393391
r.GreaterOrEqualf(
394392
elapsed,
@@ -427,8 +425,7 @@ func TestForwardRequestRedirectAndHeaders(t *testing.T) {
427425
timeouts := defaultTimeouts()
428426
timeouts.Connect = 10 * time.Millisecond
429427
timeouts.ResponseHeader = 10 * time.Millisecond
430-
backoff := timeouts.Backoff(2, 2, 1)
431-
dialCtxFunc := retryDialContextFunc(timeouts, backoff)
428+
dialCtxFunc := retryDialContextFunc(timeouts, timeouts.DefaultBackoff())
432429
res, req, err := reqAndRes("/testfwd")
433430
r.NoError(err)
434431
req = util.RequestWithStream(req, srvURL)

pkg/net/backoff.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@ import (
66
"k8s.io/apimachinery/pkg/util/wait"
77
)
88

9-
// MinTotalBackoffDuration returns the minimum duration that backoff
10-
// would wait, including all steps, not including any jitter
9+
// MinTotalBackoffDuration returns the minimum total sleep duration across all retry steps.
1110
func MinTotalBackoffDuration(backoff wait.Backoff) time.Duration {
12-
initial := backoff.Duration.Milliseconds()
13-
retMS := backoff.Duration.Milliseconds()
14-
numSteps := backoff.Steps
15-
for i := 2; i <= numSteps; i++ {
16-
retMS += initial * int64(i)
11+
var total time.Duration
12+
duration := backoff.Duration
13+
for range backoff.Steps {
14+
total += duration
15+
duration = time.Duration(float64(duration) * backoff.Factor)
1716
}
18-
return time.Duration(retMS) * time.Millisecond
17+
return total
1918
}

pkg/net/dial_context_test.go

Lines changed: 95 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,107 @@ package net
22

33
import (
44
"context"
5+
"net"
6+
"net/http"
57
"testing"
68
"time"
79

810
"github.com/stretchr/testify/require"
911
"k8s.io/apimachinery/pkg/util/wait"
1012
)
1113

14+
// getUnreachableAddr returns an address that is guaranteed to be unreachable
15+
// by allocating an available port and immediately closing it
16+
func getUnreachableAddr(t *testing.T) string {
17+
t.Helper()
18+
listener, err := net.Listen("tcp", "localhost:0")
19+
require.NoError(t, err)
20+
addr := listener.Addr().String()
21+
listener.Close()
22+
return addr
23+
}
24+
1225
func TestDialContextWithRetry(t *testing.T) {
13-
r := require.New(t)
14-
15-
const (
16-
connTimeout = 10 * time.Millisecond
17-
keepAlive = 10 * time.Millisecond
18-
network = "tcp"
19-
addr = "localhost:60001"
20-
)
21-
backoff := wait.Backoff{
22-
Duration: connTimeout,
23-
Factor: 2,
24-
Jitter: 0.5,
25-
Steps: 5,
26-
}
27-
28-
ctx := context.Background()
29-
dialer := NewNetDialer(connTimeout, keepAlive)
30-
dRetry := DialContextWithRetry(dialer, backoff)
31-
minTotalWaitDur := MinTotalBackoffDuration(backoff)
32-
33-
start := time.Now()
34-
_, err := dRetry(ctx, network, addr)
35-
r.Error(err, "error was not found")
36-
37-
elapsed := time.Since(start)
38-
r.GreaterOrEqual(
39-
elapsed,
40-
minTotalWaitDur,
41-
"total elapsed (%s) was not >= than the minimum expected (%s)",
42-
elapsed,
43-
minTotalWaitDur,
44-
)
26+
t.Run("retries with exponential backoff on connection failure", func(t *testing.T) {
27+
r := require.New(t)
28+
29+
backoff := wait.Backoff{
30+
Duration: 10 * time.Millisecond,
31+
Factor: 2,
32+
Jitter: 0.1,
33+
Steps: 3,
34+
}
35+
36+
dialer := NewNetDialer(5*time.Millisecond, 10*time.Millisecond)
37+
dialRetry := DialContextWithRetry(dialer, backoff)
38+
39+
start := time.Now()
40+
_, err := dialRetry(context.Background(), "tcp", getUnreachableAddr(t))
41+
elapsed := time.Since(start)
42+
43+
r.Error(err, "should fail when connecting to unreachable address")
44+
45+
// Verify backoff was applied by checking we took at least the minimum backoff time
46+
minExpected := MinTotalBackoffDuration(backoff)
47+
r.GreaterOrEqual(elapsed, minExpected, "should take at least minimum backoff duration")
48+
})
49+
50+
t.Run("succeeds immediately when connection available", func(t *testing.T) {
51+
r := require.New(t)
52+
53+
backoff := wait.Backoff{
54+
Duration: 50 * time.Millisecond,
55+
Factor: 2,
56+
Jitter: 0.1,
57+
Steps: 5,
58+
}
59+
60+
srv, srvURL, err := StartTestServer(NewTestHTTPHandlerWrapper(
61+
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
62+
w.WriteHeader(http.StatusOK)
63+
}),
64+
))
65+
r.NoError(err)
66+
defer srv.Close()
67+
68+
dialer := NewNetDialer(100*time.Millisecond, 1*time.Second)
69+
dialRetry := DialContextWithRetry(dialer, backoff)
70+
71+
start := time.Now()
72+
conn, err := dialRetry(context.Background(), "tcp", srvURL.Host)
73+
elapsed := time.Since(start)
74+
75+
r.NoError(err)
76+
r.NotNil(conn)
77+
if conn != nil {
78+
conn.Close()
79+
}
80+
81+
r.Less(elapsed, backoff.Duration)
82+
})
83+
84+
t.Run("respects context cancellation", func(t *testing.T) {
85+
r := require.New(t)
86+
87+
backoff := wait.Backoff{
88+
Duration: 100 * time.Millisecond,
89+
Factor: 2,
90+
Jitter: 0.1,
91+
Steps: 5,
92+
}
93+
94+
dialer := NewNetDialer(10*time.Millisecond, 1*time.Second)
95+
dialRetry := DialContextWithRetry(dialer, backoff)
96+
97+
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
98+
defer cancel()
99+
100+
start := time.Now()
101+
_, err := dialRetry(ctx, "tcp", getUnreachableAddr(t))
102+
elapsed := time.Since(start)
103+
104+
r.Error(err)
105+
r.Contains(err.Error(), "context")
106+
r.Less(elapsed, MinTotalBackoffDuration(backoff))
107+
})
45108
}

0 commit comments

Comments
 (0)