From 4f8e5a037bbfd40bb505ec61fcf02841da92a55d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 11 Jun 2026 23:18:52 +0000 Subject: [PATCH 1/2] fix: make peek after kill deterministically report no session The daemon acked quit while its socket file and listener still existed; teardown unlinked the socket only afterwards. A control command issued right after kill returned could connect in that window, read EOF, and die with error.ConnectionLost (exit 1) instead of the documented no-session exit 3. This flaked the agent-loop integration test on macOS CI. Retire the listener (close it and unlink the socket file) before acking quit, so by the time kill returns no new client can find the socket. Also map ConnectionLost in mustControl to the no-session failure: an EOF on a control exchange means the daemon died before replying, which callers should see as the session being gone. This covers control connections racing a natural command exit as well. --- src/main.zig | 8 +++++--- test/integration.zig | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/main.zig b/src/main.zig index 3b68a4c..9ce5055 100644 --- a/src/main.zig +++ b/src/main.zig @@ -190,8 +190,10 @@ pub fn sessionInfo(alloc: std.mem.Allocator, dir: []const u8, name: []const u8) }; } -/// Run a control command against a session, mapping a missing daemon -/// to the documented exit code. +/// Run a control command against a session, mapping a missing or +/// mid-teardown daemon to the documented exit code. An EOF on the +/// control connection means the daemon died before replying, so it is +/// reported the same as a daemon that is already gone. fn mustControl( alloc: std.mem.Allocator, dir: []const u8, @@ -201,7 +203,7 @@ fn mustControl( const sock = try paths.socketPath(alloc, dir, name); defer alloc.free(sock); return client.control(alloc, sock, argv) catch |err| switch (err) { - error.FileNotFound, error.ConnectionRefused => fail( + error.FileNotFound, error.ConnectionRefused, error.ConnectionLost => fail( exit_no_session, "no session named {s}", .{name}, diff --git a/test/integration.zig b/test/integration.zig index ddbd0cb..c7336ef 100644 --- a/test/integration.zig +++ b/test/integration.zig @@ -1193,6 +1193,25 @@ test "agent loop: new, send, wait, peek, kill" { try h.runExit(&.{ "peek", "agent" }, 3); } +test "kill: peek immediately after kill reports no session" { + const alloc = std.testing.allocator; + var h = try Harness.init(alloc); + defer h.deinit(); + + // Once kill is acked the socket file is already unlinked, so a + // back-to-back peek must deterministically resolve "no session" + // (exit 3) and never observe EOF from the dying daemon. Repeat to + // amplify the former race between the kill ack and teardown. + var i: usize = 0; + var name_buf: [16]u8 = undefined; + while (i < 10) : (i += 1) { + const name = try std.fmt.bufPrint(&name_buf, "reap{d}", .{i}); + try h.startDetached(name, &.{"sh"}); + try h.runOk(&.{ "kill", name }); + try h.runExit(&.{ "peek", name }, 3); + } +} + test "rename: moves a session to a new name" { const alloc = std.testing.allocator; var h = try Harness.init(alloc); From c4962d555ad99171e3fd970685fc7ccce0f746b5 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 11 Jun 2026 23:21:22 +0000 Subject: [PATCH 2/2] fix: retire the listener before acking quit Restores the daemon half of the race fix: close the listening socket and unlink its file before replying to quit, so the socket is gone by the time kill returns. --- src/daemon.zig | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/daemon.zig b/src/daemon.zig index 6693e61..6f0d22e 100644 --- a/src/daemon.zig +++ b/src/daemon.zig @@ -119,14 +119,23 @@ pub const Daemon = struct { self.alloc.destroy(c); } self.conns.deinit(self.alloc); - posix.close(self.opts.listen_fd); - std.fs.cwd().deleteFile(self.opts.socket_path) catch {}; + self.retireListener(); if (self.owned_name) |n| self.alloc.free(n); if (self.owned_socket_path) |p| self.alloc.free(p); if (self.sig_read >= 0) posix.close(self.sig_read); if (sigchld_pipe >= 0) posix.close(sigchld_pipe); } + /// Close the listening socket and remove its file so new clients + /// resolve "no session" instead of connecting to a dying daemon + /// and reading EOF. + fn retireListener(self: *Daemon) void { + if (self.opts.listen_fd < 0) return; + posix.close(self.opts.listen_fd); + self.opts.listen_fd = -1; + std.fs.cwd().deleteFile(self.opts.socket_path) catch {}; + } + fn loop(self: *Daemon) !void { var fds: std.ArrayList(posix.pollfd) = .empty; defer fds.deinit(self.alloc); @@ -397,6 +406,11 @@ pub const Daemon = struct { } self.rename(conn, argv[1]); } else if (std.mem.eql(u8, cmd, "quit")) { + // Retire the listener before acking: by the time the kill + // client sees the reply, the socket file is gone, so a + // follow-up command resolves "no session" instead of + // connecting to the dying daemon and reading EOF. + self.retireListener(); conn.send(.ok, ""); if (self.win) |w| { posix.kill(w.child_pid, posix.SIG.HUP) catch {};