Skip to content

Commit 6ef8a8a

Browse files
committed
Harden network and restore test retries
1 parent 16805f2 commit 6ef8a8a

File tree

4 files changed

+108
-63
lines changed

4 files changed

+108
-63
lines changed

lib/instances/compression_integration_linux_test.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -258,30 +258,64 @@ func waitForRunningAndExecReady(t *testing.T, ctx context.Context, mgr *manager,
258258
require.NoError(t, waitHypervisorUp(ctx, inst))
259259
}
260260
require.NoError(t, waitForExecAgent(ctx, mgr, instanceID, 30*time.Second))
261+
waitForGuestExecReady(t, ctx, inst)
261262
return inst
262263
}
263264

264-
func writeGuestMarker(t *testing.T, ctx context.Context, inst *Instance, path string, value string) {
265+
func waitForGuestExecReady(t *testing.T, ctx context.Context, inst *Instance) {
265266
t.Helper()
266-
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(compressionGuestExecTimeout))
267-
defer cancel()
268267

269-
output, exitCode, err := execCommand(execCtx, inst, "sh", "-c", fmt.Sprintf("printf %q > %s && sync", value, path))
268+
require.Eventually(t, func() bool {
269+
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(5*time.Second))
270+
defer cancel()
271+
272+
output, exitCode, err := execCommand(execCtx, inst, "true")
273+
return err == nil && exitCode == 0 && output == ""
274+
}, integrationTestTimeout(15*time.Second), 500*time.Millisecond, "guest exec should succeed after restore")
275+
}
276+
277+
func writeGuestMarker(t *testing.T, ctx context.Context, inst *Instance, path string, value string) {
278+
t.Helper()
279+
output, exitCode, err := execCommandWithRetry(ctx, inst, compressionGuestExecTimeout, "sh", "-c", fmt.Sprintf("printf %q > %s && sync", value, path))
270280
require.NoError(t, err)
271281
require.Equal(t, 0, exitCode, output)
272282
}
273283

274284
func assertGuestMarker(t *testing.T, ctx context.Context, inst *Instance, path string, expected string) {
275285
t.Helper()
276-
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(compressionGuestExecTimeout))
277-
defer cancel()
278-
279-
output, exitCode, err := execCommand(execCtx, inst, "cat", path)
286+
output, exitCode, err := execCommandWithRetry(ctx, inst, compressionGuestExecTimeout, "cat", path)
280287
require.NoError(t, err)
281288
require.Equal(t, 0, exitCode, output)
282289
assert.Equal(t, expected, output)
283290
}
284291

292+
func execCommandWithRetry(ctx context.Context, inst *Instance, timeout time.Duration, command ...string) (string, int, error) {
293+
deadline := time.Now().Add(integrationTestTimeout(timeout))
294+
var lastOutput string
295+
var lastExitCode int
296+
var lastErr error
297+
298+
for {
299+
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(5*time.Second))
300+
output, exitCode, err := execCommand(execCtx, inst, command...)
301+
cancel()
302+
303+
if err == nil {
304+
return output, exitCode, nil
305+
}
306+
307+
lastOutput = output
308+
lastExitCode = exitCode
309+
lastErr = err
310+
311+
if time.Now().After(deadline) {
312+
return lastOutput, lastExitCode, lastErr
313+
}
314+
315+
time.Sleep(500 * time.Millisecond)
316+
}
317+
}
318+
285319
func waitForCompressionJobStart(t *testing.T, mgr *manager, key string, timeout time.Duration) {
286320
t.Helper()
287321
deadline := time.Now().Add(timeout)

lib/instances/network_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,42 +245,42 @@ func TestDockerForwardChainRestored(t *testing.T) {
245245
require.NoError(t, manager.networkManager.Initialize(ctx, nil))
246246

247247
// Check if DOCKER-FORWARD chain exists (Docker must be running on host).
248-
checkChain := exec.Command("iptables", "-L", "DOCKER-FORWARD", "-n")
248+
checkChain := exec.Command("iptables", "-w", "5", "-L", "DOCKER-FORWARD", "-n")
249249
if checkChain.Run() != nil {
250250
t.Skip("DOCKER-FORWARD chain not present (Docker not running), skipping")
251251
}
252252

253253
// Verify jump currently exists.
254-
checkJump := exec.Command("iptables", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
254+
checkJump := exec.Command("iptables", "-w", "5", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
255255
require.NoError(t, checkJump.Run(), "DOCKER-FORWARD jump should exist before test")
256256

257257
// Safety net: restore the jump if the test fails or aborts after we delete it,
258258
// so we don't leave the host's Docker networking broken.
259259
t.Cleanup(func() {
260-
check := exec.Command("iptables", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
260+
check := exec.Command("iptables", "-w", "5", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
261261
if check.Run() != nil {
262-
restore := exec.Command("iptables", "-A", "FORWARD", "-j", "DOCKER-FORWARD")
262+
restore := exec.Command("iptables", "-w", "5", "-A", "FORWARD", "-j", "DOCKER-FORWARD")
263263
_ = restore.Run()
264264
}
265265
})
266266

267267
// Simulate the hypervisor flush: remove every jump.
268268
for {
269-
delJump := exec.Command("iptables", "-D", "FORWARD", "-j", "DOCKER-FORWARD")
269+
delJump := exec.Command("iptables", "-w", "5", "-D", "FORWARD", "-j", "DOCKER-FORWARD")
270270
if err := delJump.Run(); err != nil {
271271
break
272272
}
273273
}
274274

275275
// Confirm it's gone.
276-
checkGone := exec.Command("iptables", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
276+
checkGone := exec.Command("iptables", "-w", "5", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
277277
require.Error(t, checkGone.Run(), "DOCKER-FORWARD jump should be gone after delete")
278278

279279
// Re-initialize network — this should restore the jump.
280280
require.NoError(t, manager.networkManager.Initialize(ctx, nil))
281281

282282
// Verify jump is restored.
283-
checkRestored := exec.Command("iptables", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
283+
checkRestored := exec.Command("iptables", "-w", "5", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
284284
require.NoError(t, checkRestored.Run(), "ensureDockerForwardJump should have restored the DOCKER-FORWARD jump")
285285
}
286286

lib/network/bridge_linux.go

Lines changed: 24 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,18 @@ import (
2020
)
2121

2222
const netlinkDumpRetryCount = 3
23+
const iptablesWaitSeconds = "5"
24+
25+
func newIPTablesCommand(args ...string) *exec.Cmd {
26+
fullArgs := make([]string, 0, len(args)+2)
27+
fullArgs = append(fullArgs, "-w", iptablesWaitSeconds)
28+
fullArgs = append(fullArgs, args...)
29+
cmd := exec.Command("iptables", fullArgs...)
30+
cmd.SysProcAttr = &syscall.SysProcAttr{
31+
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
32+
}
33+
return cmd
34+
}
2335

2436
func listBridgeAddrsWithRetry(link netlink.Link) ([]netlink.Addr, error) {
2537
var err error
@@ -288,13 +300,10 @@ func (m *manager) setupIPTablesRules(ctx context.Context, subnet, bridgeName str
288300
// ensureNATRule ensures the MASQUERADE rule exists with correct uplink
289301
func (m *manager) ensureNATRule(subnet, uplink, comment string) (string, error) {
290302
// Check if rule exists with correct subnet and uplink
291-
checkCmd := exec.Command("iptables", "-t", "nat", "-C", "POSTROUTING",
303+
checkCmd := newIPTablesCommand("-t", "nat", "-C", "POSTROUTING",
292304
"-s", subnet, "-o", uplink,
293305
"-m", "comment", "--comment", comment,
294306
"-j", "MASQUERADE")
295-
checkCmd.SysProcAttr = &syscall.SysProcAttr{
296-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
297-
}
298307
if checkCmd.Run() == nil {
299308
return "existing", nil
300309
}
@@ -303,13 +312,10 @@ func (m *manager) ensureNATRule(subnet, uplink, comment string) (string, error)
303312
m.deleteNATRuleByComment(comment)
304313

305314
// Add rule with comment
306-
addCmd := exec.Command("iptables", "-t", "nat", "-A", "POSTROUTING",
315+
addCmd := newIPTablesCommand("-t", "nat", "-A", "POSTROUTING",
307316
"-s", subnet, "-o", uplink,
308317
"-m", "comment", "--comment", comment,
309318
"-j", "MASQUERADE")
310-
addCmd.SysProcAttr = &syscall.SysProcAttr{
311-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
312-
}
313319
if err := addCmd.Run(); err != nil {
314320
return "", fmt.Errorf("add masquerade rule: %w", err)
315321
}
@@ -327,10 +333,7 @@ func (m *manager) ruleComment(base string) string {
327333
// deleteNATRuleByComment deletes any NAT POSTROUTING rule containing our comment
328334
func (m *manager) deleteNATRuleByComment(comment string) {
329335
// List NAT POSTROUTING rules
330-
cmd := exec.Command("iptables", "-t", "nat", "-L", "POSTROUTING", "--line-numbers", "-n")
331-
cmd.SysProcAttr = &syscall.SysProcAttr{
332-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
333-
}
336+
cmd := newIPTablesCommand("-t", "nat", "-L", "POSTROUTING", "--line-numbers", "-n")
334337
output, err := cmd.Output()
335338
if err != nil {
336339
return
@@ -350,10 +353,7 @@ func (m *manager) deleteNATRuleByComment(comment string) {
350353

351354
// Delete in reverse order
352355
for i := len(ruleNums) - 1; i >= 0; i-- {
353-
delCmd := exec.Command("iptables", "-t", "nat", "-D", "POSTROUTING", ruleNums[i])
354-
delCmd.SysProcAttr = &syscall.SysProcAttr{
355-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
356-
}
356+
delCmd := newIPTablesCommand("-t", "nat", "-D", "POSTROUTING", ruleNums[i])
357357
delCmd.Run() // ignore error
358358
}
359359
}
@@ -369,14 +369,11 @@ func (m *manager) ensureForwardRule(inIface, outIface, ctstate, comment string,
369369
m.deleteForwardRuleByComment(comment)
370370

371371
// Insert at specified position with comment
372-
addCmd := exec.Command("iptables", "-I", "FORWARD", fmt.Sprintf("%d", position),
372+
addCmd := newIPTablesCommand("-I", "FORWARD", fmt.Sprintf("%d", position),
373373
"-i", inIface, "-o", outIface,
374374
"-m", "conntrack", "--ctstate", ctstate,
375375
"-m", "comment", "--comment", comment,
376376
"-j", "ACCEPT")
377-
addCmd.SysProcAttr = &syscall.SysProcAttr{
378-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
379-
}
380377
if err := addCmd.Run(); err != nil {
381378
return "", fmt.Errorf("insert forward rule: %w", err)
382379
}
@@ -386,10 +383,7 @@ func (m *manager) ensureForwardRule(inIface, outIface, ctstate, comment string,
386383
// isForwardRuleCorrect checks if our rule exists at the expected position with correct interfaces
387384
func (m *manager) isForwardRuleCorrect(inIface, outIface, comment string, position int) bool {
388385
// List FORWARD chain with line numbers
389-
cmd := exec.Command("iptables", "-L", "FORWARD", "--line-numbers", "-n", "-v")
390-
cmd.SysProcAttr = &syscall.SysProcAttr{
391-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
392-
}
386+
cmd := newIPTablesCommand("-L", "FORWARD", "--line-numbers", "-n", "-v")
393387
output, err := cmd.Output()
394388
if err != nil {
395389
return false
@@ -417,10 +411,7 @@ func (m *manager) isForwardRuleCorrect(inIface, outIface, comment string, positi
417411
// deleteForwardRuleByComment deletes any FORWARD rule containing our comment
418412
func (m *manager) deleteForwardRuleByComment(comment string) {
419413
// List FORWARD rules
420-
cmd := exec.Command("iptables", "-L", "FORWARD", "--line-numbers", "-n")
421-
cmd.SysProcAttr = &syscall.SysProcAttr{
422-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
423-
}
414+
cmd := newIPTablesCommand("-L", "FORWARD", "--line-numbers", "-n")
424415
output, err := cmd.Output()
425416
if err != nil {
426417
return
@@ -440,10 +431,7 @@ func (m *manager) deleteForwardRuleByComment(comment string) {
440431

441432
// Delete in reverse order
442433
for i := len(ruleNums) - 1; i >= 0; i-- {
443-
delCmd := exec.Command("iptables", "-D", "FORWARD", ruleNums[i])
444-
delCmd.SysProcAttr = &syscall.SysProcAttr{
445-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
446-
}
434+
delCmd := newIPTablesCommand("-D", "FORWARD", ruleNums[i])
447435
delCmd.Run() // ignore error
448436
}
449437
}
@@ -460,19 +448,13 @@ func (m *manager) ensureDockerForwardJump(ctx context.Context) {
460448
log := logger.FromContext(ctx)
461449

462450
// Check if DOCKER-FORWARD chain exists (Docker is installed and configured)
463-
checkChain := exec.Command("iptables", "-L", "DOCKER-FORWARD", "-n")
464-
checkChain.SysProcAttr = &syscall.SysProcAttr{
465-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
466-
}
451+
checkChain := newIPTablesCommand("-L", "DOCKER-FORWARD", "-n")
467452
if checkChain.Run() != nil {
468453
return // Chain doesn't exist — Docker not installed or not configured
469454
}
470455

471456
// Check if jump already exists in FORWARD
472-
checkJump := exec.Command("iptables", "-C", "FORWARD", "-j", "DOCKER-FORWARD")
473-
checkJump.SysProcAttr = &syscall.SysProcAttr{
474-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
475-
}
457+
checkJump := newIPTablesCommand("-C", "FORWARD", "-j", "DOCKER-FORWARD")
476458
if checkJump.Run() == nil {
477459
return // Jump already present
478460
}
@@ -481,10 +463,7 @@ func (m *manager) ensureDockerForwardJump(ctx context.Context) {
481463
// Insert right after hypeman's last rule so the jump is evaluated before any
482464
// explicit DROP/REJECT rules that an external firewall tool may have added.
483465
insertPos := m.lastHypemanForwardRulePosition() + 1
484-
addJump := exec.Command("iptables", "-I", "FORWARD", fmt.Sprintf("%d", insertPos), "-j", "DOCKER-FORWARD")
485-
addJump.SysProcAttr = &syscall.SysProcAttr{
486-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
487-
}
466+
addJump := newIPTablesCommand("-I", "FORWARD", fmt.Sprintf("%d", insertPos), "-j", "DOCKER-FORWARD")
488467
if err := addJump.Run(); err != nil {
489468
log.WarnContext(ctx, "failed to restore Docker FORWARD chain jump", "error", err)
490469
return
@@ -496,10 +475,7 @@ func (m *manager) ensureDockerForwardJump(ctx context.Context) {
496475
// lastHypemanForwardRulePosition returns the line number of the last hypeman-managed
497476
// rule in the FORWARD chain, or 0 if none are found.
498477
func (m *manager) lastHypemanForwardRulePosition() int {
499-
cmd := exec.Command("iptables", "-L", "FORWARD", "--line-numbers", "-n", "-v")
500-
cmd.SysProcAttr = &syscall.SysProcAttr{
501-
AmbientCaps: []uintptr{unix.CAP_NET_ADMIN},
502-
}
478+
cmd := newIPTablesCommand("-L", "FORWARD", "--line-numbers", "-n", "-v")
503479
output, err := cmd.Output()
504480
if err != nil {
505481
return 0

skills/test-agent/agents/test-agent/NOTES.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,41 @@
259259
- Run 1: pass (`lib/instances` 193.279s)
260260
- Run 2: pass (`lib/instances` 261.633s)
261261
- Run 3: pass (`lib/instances` 173.573s)
262+
263+
## 2026-04-07 - PR #184 follow-up CI round on `codex/standby-compression-delay`
264+
265+
### Initial CI red signature
266+
- Linux `test` job failed on `TestDockerForwardChainRestored`.
267+
- Failure:
268+
- `ensureDockerForwardJump should have restored the DOCKER-FORWARD jump`
269+
- raw `iptables -C FORWARD -j DOCKER-FORWARD` exited non-zero in the test after re-initialization.
270+
271+
### Root cause and fix
272+
- The Docker-forward recovery path and the test both used plain `iptables` invocations with no wait for the xtables lock.
273+
- Under parallel CI activity, a transient lock holder can cause checks/deletes/inserts to fail immediately and make the test observe a missing rule even though the recovery logic is otherwise correct.
274+
- Fix:
275+
- Added a small `newIPTablesCommand` helper in `lib/network/bridge_linux.go` that uses `iptables -w 5 ...` with the existing `CAP_NET_ADMIN` setup.
276+
- Switched the bridge NAT/FORWARD rule management and `ensureDockerForwardJump` commands to that helper.
277+
- Updated `TestDockerForwardChainRestored` in `lib/instances/network_test.go` to use `iptables -w 5` for its direct host-global mutations/checks.
278+
279+
### Secondary flake surfaced during Deft reruns
280+
- A subsequent Deft full-suite rerun exposed a post-restore guest exec race in `TestCloudHypervisorStandbyRestoreCompressionScenarios`:
281+
- `receive response (stdout=0, stderr=0): rpc error: code = DeadlineExceeded desc = stream terminated by RST_STREAM with error code: CANCEL`
282+
- The compression integration harness was only waiting for the exec agent socket and then issuing marker reads/writes immediately after restore.
283+
- Fix:
284+
- Added a no-op post-restore guest exec readiness probe in `waitForRunningAndExecReady`.
285+
- Added a small retry wrapper for the compression integration test’s guest marker read/write commands so transient post-restore transport resets do not fail the scenario immediately.
286+
287+
### Validation
288+
- Deft targeted loop:
289+
- `go test -count=20 -run '^TestDockerForwardChainRestored$' -v ./lib/instances`
290+
- Result: pass
291+
- Deft targeted loop:
292+
- `go test -count=10 -run '^TestCloudHypervisorStandbyRestoreCompressionScenarios$' -tags containers_image_openpgp -timeout=30m ./lib/instances`
293+
- Result: pass
294+
- Local sanity:
295+
- `go test ./lib/instances -count=1`
296+
- Result: pass (`117.538s`)
262297
- `exec-agent not ready for instance ... within 15s (last state: Initializing)`
263298

264299
### Additional flakes reproduced during Deft full-suite verification

0 commit comments

Comments
 (0)