Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/full-suite.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ jobs:
test-shards:
name: Test shard ${{ matrix.shard }}/4
runs-on: ubuntu-latest
# Backstop for a wedged shard. The per-invocation watchdog (L2,
# scripts/lib/run-vitest-watchdog.mjs) kills any single hung invocation at
# its budget ceiling (<=30min), so this job budget only fires if L2 itself
# wedges — and it must sit strictly above that ceiling so L2 fires first.
# Without this, a hang ran to GitHub's 6h default (the silent-black-hole bug
# this plan closes).
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -119,6 +126,9 @@ jobs:
test-inventory-guard:
name: Dashboard curated-gate guard
runs-on: ubuntu-latest
# Runs only `vitest list` (no tests execute), so this is generous headroom,
# not a tight bound — but no CI job should be able to hang to the 6h ceiling.
timeout-minutes: 20
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -163,6 +173,9 @@ jobs:
test-slow:
name: Engine slow tier
runs-on: ubuntu-latest
# Real-git slow suites; same backstop rationale as test-shards. Sits above
# the L2 per-invocation ceiling so the watchdog fires first on a single hang.
timeout-minutes: 60
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
380 changes: 380 additions & 0 deletions docs/plans/2026-06-13-001-fix-test-timeout-failures-plan.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,11 @@ async function spawnWrapperTreeUntilTimeout() {
stdio: "pipe",
env: {
...process.env,
FUSION_RUN_VITEST_TIMEOUT_MS: "100",
FUSION_RUN_VITEST_KILL_GRACE_MS: "50",
// Budget must outlast the stub child's startup so the test can observe
// it alive before the watchdog reaps it; the contract under test is
// "timeout -> exit 124 + group reaped", not the exact budget value.
FUSION_RUN_VITEST_TIMEOUT_MS: "2000",
FUSION_RUN_VITEST_KILL_GRACE_MS: "200",
FUSION_RUN_VITEST_SPAWN_OVERRIDE: JSON.stringify({
command: process.execPath,
args: [childPath, pidFile, grandchildPath],
Expand Down Expand Up @@ -230,16 +233,17 @@ afterEach(async () => {
describe("run-vitest-with-heap", () => {
it("reaps the spawned process group on SIGTERM", async () => {
const { stderr } = await spawnWrapperTree("SIGTERM");
expect(stderr).toContain("[dashboard-vitest] received SIGTERM; forwarding to vitest process group");
expect(stderr).toContain("[watchdog] received SIGTERM; forwarding to group");
});

it("reaps the spawned process group on SIGINT", async () => {
const { stderr } = await spawnWrapperTree("SIGINT");
expect(stderr).toContain("[dashboard-vitest] received SIGINT; forwarding to vitest process group");
expect(stderr).toContain("[watchdog] received SIGINT; forwarding to group");
});

it("times out and reaps the spawned process group", async () => {
const { stderr } = await spawnWrapperTreeUntilTimeout();
expect(stderr).toContain("[dashboard-vitest] timeout after 100ms");
expect(stderr).toContain("[watchdog] HANG:");
expect(stderr).toContain("exceeded budget 2000ms");
});
});
149 changes: 42 additions & 107 deletions packages/dashboard/scripts/run-vitest-with-heap.mjs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env node
/* global clearInterval, clearTimeout, console, process, setInterval, setTimeout */
/* global console, process */

import { spawn } from "node:child_process";

import { runWithWatchdog } from "../../../scripts/lib/run-vitest-watchdog.mjs";

const rawArgs = process.argv.slice(2);
const heapArg = rawArgs.find((arg) => arg.startsWith("--heap="));
const heapMb = heapArg?.slice("--heap=".length) || "6144";
Expand All @@ -16,8 +18,21 @@ if (vitestArgs.length === 0) {
const nodeOptions = [`--max-old-space-size=${heapMb}`, process.env.NODE_OPTIONS || ""]
.join(" ")
.trim();
const timeoutMs = Number.parseInt(process.env.FUSION_RUN_VITEST_TIMEOUT_MS || "900000", 10);
const forceKillGraceMs = Number.parseInt(process.env.FUSION_RUN_VITEST_KILL_GRACE_MS || "5000", 10);
/**
 * Read a strictly positive integer from an environment variable, falling back
 * to `fallback` when the variable is missing, empty, or malformed.
 *
 * A bad env value must never produce NaN (or a silently truncated number): the
 * watchdog only arms when its budget is finite and > 0, so a NaN here would
 * disable the killer and bring back the very hang this wrapper exists to
 * prevent.
 *
 * The whole (trimmed) value must be decimal digits. `Number.parseInt` alone is
 * not enough because it accepts a numeric prefix and drops the rest, so
 * "100abc" would become 100, "1.5" would become 1 and "1e3" would become 1 —
 * all malformed inputs that should be rejected loudly instead.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value returned when the variable is unset, empty, or invalid.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function positiveIntEnv(name, fallback) {
  const raw = process.env[name];
  if (raw == null || raw === "") return fallback;

  const text = raw.trim();
  if (/^\+?\d+$/.test(text)) {
    const parsed = Number(text);
    // isSafeInteger also rejects digit strings too large to represent exactly.
    if (Number.isSafeInteger(parsed) && parsed > 0) return parsed;
  }

  console.error(`[dashboard-vitest] ignoring invalid ${name}=${JSON.stringify(raw)}; using ${fallback}`);
  return fallback;
}

const timeoutMs = positiveIntEnv("FUSION_RUN_VITEST_TIMEOUT_MS", 900000);
const graceMs = positiveIntEnv("FUSION_RUN_VITEST_KILL_GRACE_MS", 5000);

function resolveSpawnCommand() {
const override = process.env.FUSION_RUN_VITEST_SPAWN_OVERRIDE;
Expand All @@ -43,110 +58,30 @@ function resolveSpawnCommand() {
}

const { command, args } = resolveSpawnCommand();
// process-supervisor-allowlist: foreground wrapper signals the entire vitest process group on death/timeout; not a background daemon
const child = spawn(command, args, {
detached: true,
stdio: "inherit",
const label = vitestArgs.join(" ");

// Dashboard lanes keep their historical fixed budget (default 15min) rather than
// the timings-derived bands the shard/changed runners use — heap pressure, not
// duration, is what wedges a lane, so a flat generous budget is correct here.
runWithWatchdog({
command,
args,
env: { ...process.env, NODE_OPTIONS: nodeOptions },
});

const heartbeat = setInterval(() => {
console.log(`[dashboard-vitest] still running: ${vitestArgs.join(" ")}`);
}, 5_000);
let timeoutExitCode = null;
let forceKillTimer = null;
let lastForwardedSignal = null;
let lastForwardReason = null;
const timeout = Number.isFinite(timeoutMs) && timeoutMs > 0
? setTimeout(() => {
timeoutExitCode = 124;
console.error(`[dashboard-vitest] timeout after ${timeoutMs}ms: ${vitestArgs.join(" ")}`);
forwardSignal("SIGTERM", "timeout");
forceKillTimer = setTimeout(() => {
forwardSignal("SIGKILL", "timeout-grace-expired");
}, Math.max(1, forceKillGraceMs));
forceKillTimer.unref();
}, timeoutMs)
: null;
timeout?.unref();

function clearHeartbeat() {
clearInterval(heartbeat);
}

function clearTimers() {
clearHeartbeat();
if (timeout) clearTimeout(timeout);
if (forceKillTimer) clearTimeout(forceKillTimer);
}

function forwardSignal(signal, reason = "external-signal") {
clearHeartbeat();
lastForwardedSignal = signal;
lastForwardReason = reason;

try {
process.kill(-child.pid, signal);
return;
} catch (error) {
if (!(error instanceof Error) || !("code" in error)) {
throw error;
budgetMs: timeoutMs,
graceMs,
label,
log: console.error,
spawn,
})
.then(({ code, signal, timedOut }) => {
if (signal) {
// Re-raise the child's terminating signal so the wrapper exits the same way.
process.kill(process.pid, signal);
return;
}

if (error.code !== "ESRCH" && error.code !== "EPERM") {
throw error;
}
}

try {
child.kill(signal);
} catch (error) {
if (!(error instanceof Error) || !("code" in error) || error.code !== "ESRCH") {
throw error;
}
}
}

for (const signal of ["SIGINT", "SIGTERM", "SIGHUP"]) {
process.on(signal, () => {
console.error(`[dashboard-vitest] received ${signal}; forwarding to vitest process group: ${vitestArgs.join(" ")}`);
forwardSignal(signal, "wrapper-received-signal");
process.exit(code ?? (timedOut ? 124 : 1));
})
.catch((error) => {
console.error(error);
process.exit(1);
});
}

process.on("exit", () => {
clearTimers();
try {
process.kill(-child.pid, "SIGTERM");
} catch (error) {
if (
!(error instanceof Error) ||
!("code" in error) ||
(error.code !== "ESRCH" && error.code !== "EPERM")
) {
throw error;
}
}
});

child.on("error", (error) => {
clearTimers();
console.error(error);
process.exit(1);
});

child.on("close", (code, signal) => {
clearTimers();
if (timeoutExitCode !== null) {
process.exit(timeoutExitCode);
}
if (signal) {
const forwardedContext = lastForwardedSignal
? ` after forwarding ${lastForwardedSignal} (${lastForwardReason ?? "unknown-reason"})`
: " without a wrapper-forwarded signal";
console.error(`[dashboard-vitest] child exited via ${signal}${forwardedContext}: ${vitestArgs.join(" ")}`);
process.kill(process.pid, signal);
return;
}
process.exit(code ?? 1);
});
Loading
Loading