Skip to content

Commit 2223efd

Browse files
committed
Instrument the Windows job-reap test's child-side failures
The job-reap test still fails on windows-latest with an empty captured stderr, which proves the server ran its whole script but cannot say anything about the child. Close the remaining blind spots:

- Route the child's stderr into the server's (which errlog captures), so a child that dies at startup leaves its traceback in the failure message instead of vanishing into the hidden console.
- Print a child-started marker to stderr first, splitting "child never spawned" from "child started but could not connect".
- Report how many of the two liveness connections arrived and which leg is missing, instead of one undifferentiated timeout.
- Record when the stdio_client context was entered and include the spawn-to-entry split in the failure message, so a stalled spawn is distinguishable from a stalled child.

Verified on POSIX by running the same choreography with a deliberately unreachable child port: the failure message names the missing leg and quotes the child's ConnectionRefusedError traceback verbatim.
1 parent c5eff45 commit 2223efd

1 file changed

Lines changed: 25 additions & 8 deletions

File tree

tests/transports/stdio/test_windows.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,21 @@ async def test_a_gracefully_exited_servers_child_is_reaped_when_the_job_handle_c
5757
through the graceful path's job-handle close and not through the escalation's
5858
`TerminateJobObject` — the two kills are indistinguishable on the socket.
5959
60-
The server connects back too (not just the child), and its stderr is captured
61-
through `errlog`: a failure then says *which* process never arrived and what the
62-
server printed, instead of one silent timeout — xdist swallows subprocess stderr
63-
on CI, so without the capture a broken spawn chain is undiagnosable there.
60+
The server connects back too (not just the child), the child's stderr is routed
61+
into the server's, and both are captured through `errlog`; the child also prints
62+
a startup marker there. A timeout failure then reports how many connections
63+
arrived (so which process never showed), how long the spawn took, and the
64+
captured stderr verbatim — xdist swallows subprocess stderr on CI, so without
65+
the capture a broken spawn chain is undiagnosable there.
6466
"""
6567
async with AsyncExitStack() as stack:
6668
sock, port = await open_liveness_listener()
6769
stack.push_async_callback(sock.aclose)
6870

69-
child = connect_back_script(port)
71+
# The startup marker (and any child traceback, via the Popen's
72+
# stderr=sys.stderr below) lands in errlog, splitting "child never
73+
# spawned/started" from "child started but could not connect".
74+
child = "import sys\nprint('child-started', file=sys.stderr, flush=True)\n" + connect_back_script(port)
7075
# The server spawns a child (its Popen failure, if any, is surfaced on
7176
# stderr), connects back itself, then exits as soon as its stdin closes —
7277
# the well-behaved graceful path, so the escalation never runs. The child
@@ -77,7 +82,7 @@ async def test_a_gracefully_exited_servers_child_is_reaped_when_the_job_handle_c
7782
server = (
7883
f"import socket, subprocess, sys\n"
7984
f"try:\n"
80-
f" subprocess.Popen([sys.executable, '-c', {child!r}])\n"
85+
f" subprocess.Popen([sys.executable, '-c', {child!r}], stderr=sys.stderr)\n"
8186
f"except BaseException as exc:\n"
8287
f" print(exc, file=sys.stderr, flush=True)\n"
8388
f" raise\n"
@@ -93,20 +98,32 @@ def server_stderr() -> str:
9398
errlog.seek(0)
9499
return errlog.read()
95100

101+
streams: list[anyio.abc.SocketStream] = []
102+
spawn_started = anyio.current_time()
103+
entered_at: float | None = None
96104
try:
97105
# The bound covers two Python interpreter cold starts on a loaded
98106
# runner; a healthy run takes well under a second.
99107
with anyio.fail_after(15.0):
100108
async with stdio_client(server_params, errlog=errlog):
109+
entered_at = anyio.current_time()
101110
# The server and child race to connect; accept both,
102111
# order-agnostic (accept_alive verifies each banner).
103-
streams: list[anyio.abc.SocketStream] = []
104112
for _ in range(2):
105113
stream = await accept_alive(sock)
106114
stack.push_async_callback(stream.aclose)
107115
streams.append(stream)
108116
except TimeoutError:
109-
pytest.fail(f"a liveness connection never arrived; server stderr: {server_stderr()!r}")
117+
missing_leg = "the server never ran its connect line" if not streams else "the child never connected"
118+
spawn_split = (
119+
"the context never entered"
120+
if entered_at is None
121+
else f"the context entered {entered_at - spawn_started:.1f}s after spawn began"
122+
)
123+
pytest.fail(
124+
f"{len(streams)}/2 liveness connections arrived ({missing_leg}); "
125+
f"{spawn_split}; server stderr: {server_stderr()!r}"
126+
)
110127

111128
# Both peers connected and the context has fully exited, closing the
112129
# job handle. KILL_ON_JOB_CLOSE must have killed the child, and the

0 commit comments

Comments
 (0)