From ec8da57e9305640ecbba873c632646b7413e265b Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Sun, 21 Jun 2026 11:08:40 -0400
Subject: [PATCH 01/38] feat(ci-driver): wizard-ci-tools control plane for
headless e2e + record/replay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A control plane over the TUI store that drives the wizard end-to-end with no
terminal and no browser, for CI/e2e and agent-driven testing. The render is a
pure function of the nanostore, so driving committed state == driving the UI.
Core files (src/lib/ci-driver/):
- wizard-ci-driver.ts — read_state / list_actions / perform_action over a live
WizardStore. read_state is a truthful, secret-free projection of committed
state (+ derived currentScreen); perform_action commits via the exact store
setter the Ink screen's key handler calls.
- action-registry.ts — declarative screen -> commit-action map (exhaustive over
ScreenId/Overlay). The actuation surface: name an action, not a keystroke.
- wizard-ci-tools.ts — in-process MCP server exposing the three tools, so an
external harness or LLM can drive a real run.
- e2e-profile.ts — WizardE2eProfile: a program's declarative e2e test definition
(the UI choices). decideE2eAction(state, profile) maps screen -> commit, so
the harness is generic and the choices live on the program.
- recorder.ts — captures a frame at each key moment (route/task/status/runPhase/
overlay change) off the store's version counter; redacts the access token.
- replay.ts — reconstructs a throwaway store per frame and renders the REAL Ink
screen back to ANSI, so a run replays in the terminal.
- DRIVING-E2E-FROM-AN-AGENT.md — how a future agent drives these.
- __tests__/ — control-plane walk, flow snapshot (TUI-snapshot analog), recorder.
Programs declare their flow's UI choices:
- programs/program-step.ts — ProgramConfig.e2e?: WizardE2eProfile.
- programs/posthog-integration/index.ts — the integration program's e2e profile.
Harness/entry scripts:
- scripts/e2e-full-run.no-jest.ts — headless full run: real WizardStore + InkUI
(never rendered) + concurrent driver + real runAgent; emits a structured
result + a recording.
- scripts/replay-e2e.no-jest.ts — replay a recording in the terminal.
- scripts/ci-driver-demo.ts — offline control-plane demo (no agent).
Additive; no core wizard behavior changed. The workbench `wizard-ci --e2e`
(PostHog/wizard-workbench) orchestrates these against real test apps.
Co-Authored-By: Claude Opus 4.8
---
scripts/ci-driver-demo.ts | 138 +++++++++
scripts/e2e-full-run.no-jest.ts | 281 ++++++++++++++++++
scripts/replay-e2e.no-jest.ts | 71 +++++
.../ci-driver/DRIVING-E2E-FROM-AN-AGENT.md | 130 ++++++++
.../e2e-flow-snapshot.test.ts.snap | 93 ++++++
.../__tests__/e2e-flow-snapshot.test.ts | 96 ++++++
src/lib/ci-driver/__tests__/recorder.test.ts | 103 +++++++
.../__tests__/wizard-ci-driver.test.ts | 183 ++++++++++++
src/lib/ci-driver/action-registry.ts | 270 +++++++++++++++++
src/lib/ci-driver/e2e-profile.ts | 154 ++++++++++
src/lib/ci-driver/recorder.ts | 159 ++++++++++
src/lib/ci-driver/replay.ts | 74 +++++
src/lib/ci-driver/wizard-ci-driver.ts | 192 ++++++++++++
src/lib/ci-driver/wizard-ci-tools.ts | 108 +++++++
src/lib/programs/posthog-integration/index.ts | 13 +
src/lib/programs/program-step.ts | 7 +
16 files changed, 2072 insertions(+)
create mode 100644 scripts/ci-driver-demo.ts
create mode 100644 scripts/e2e-full-run.no-jest.ts
create mode 100644 scripts/replay-e2e.no-jest.ts
create mode 100644 src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
create mode 100644 src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
create mode 100644 src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts
create mode 100644 src/lib/ci-driver/__tests__/recorder.test.ts
create mode 100644 src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts
create mode 100644 src/lib/ci-driver/action-registry.ts
create mode 100644 src/lib/ci-driver/e2e-profile.ts
create mode 100644 src/lib/ci-driver/recorder.ts
create mode 100644 src/lib/ci-driver/replay.ts
create mode 100644 src/lib/ci-driver/wizard-ci-driver.ts
create mode 100644 src/lib/ci-driver/wizard-ci-tools.ts
diff --git a/scripts/ci-driver-demo.ts b/scripts/ci-driver-demo.ts
new file mode 100644
index 00000000..a2575289
--- /dev/null
+++ b/scripts/ci-driver-demo.ts
@@ -0,0 +1,138 @@
+/**
+ * Headless control-plane demo — runs the real wizard store/router/detection
+ * flow with NO terminal and NO browser, driven entirely by WizardCiDriver.
+ *
+ * This is the runnable (tsx) sibling of the jest control-plane test: it proves
+ * the same loop works outside a test harness, against real framework detection
+ * on a 1-file project. The agent + auth steps are injected (the agent is a
+ * separate, token-burning concern proven elsewhere) so this stays fast and
+ * offline; every human decision goes through the driver's read/act surface.
+ *
+ *   POSTHOG_WIZARD_INSTALL_DIR=<app-dir> tsx scripts/ci-driver-demo.ts
+ */
+
+import fs from 'fs';
+import os from 'os';
+import path from 'path';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase } from '@lib/wizard-session';
+import { Program } from '@lib/programs/program-registry';
+import { WizardReadiness } from '@lib/health-checks/readiness';
+import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
+import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+
+/** Build a throwaway temp project for the demo and return its directory. */
+function makeOneFileProject(): string {
+  // The project's single file: a package.json that depends on Next.js, which is
+  // what framework detection keys off to resolve the Next.js flow.
+  const manifest = {
+    name: 'demo',
+    private: true,
+    dependencies: { next: '15.3.0' },
+  };
+  const projectDir = fs.mkdtempSync(path.join(os.tmpdir(), 'wizard-ci-'));
+  const manifestPath = path.join(projectDir, 'package.json');
+  fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
+  return projectDir;
+}
+
+const log = (msg: string) => process.stdout.write(` ${msg}\n`);
+
+async function main() {
+  const installDir =
+    process.env.POSTHOG_WIZARD_INSTALL_DIR ?? makeOneFileProject();
+  process.stdout.write(`\nHeadless wizard-ci-tools demo on: ${installDir}\n\n`);
+
+  const store = new WizardStore(Program.PostHogIntegration);
+  setUI(new InkUI(store)); // real UI, never rendered → no Ink, no browser
+  const session = buildSession({ installDir, ci: true });
+  store.session = session;
+
+  const driver = new WizardCiDriver(store);
+
+  // Run the program's onReady hooks — this is REAL framework detection.
+  await store.runReadyHooks();
+
+  const seen: ScreenName[] = [];
+  const note = (s = driver.readState()) => {
+    if (seen[seen.length - 1] !== s.currentScreen) {
+      seen.push(s.currentScreen);
+      log(`screen → ${s.currentScreen}`);
+    }
+    return s;
+  };
+
+  note();
+  log(`detected framework: ${store.session.integration ?? '(none)'}`);
+
+  // Walk the flow for at most 30 iterations. Each branch commits what a user (or
+  // an external transition) would, and the next pass reads the new screen back.
+  for (let i = 0; i < 30; i++) {
+    const state = note();
+    const screen = state.currentScreen;
+
+    if (screen === ScreenId.Intro) {
+      driver.performAction('confirm_setup');
+    } else if (screen === ScreenId.HealthCheck) {
+      // Simulate the readiness probe coming back clean (offline-safe).
+      store.setReadinessResult({
+        decision: WizardReadiness.Yes,
+        health: {} as never,
+        reasons: [],
+      });
+    } else if (screen === ScreenId.Setup) {
+      const q = state.setupQuestions[0];
+      log(`answering setup "${q.key}" → ${q.options[0].value}`);
+      driver.performAction('choose', { key: q.key, value: q.options[0].value });
+    } else if (screen === ScreenId.Auth) {
+      // Inject credentials (the scoped phx key can't fetch project data; the
+      // gateway-bearer path is proven separately). accessToken would be the
+      // phx key in a real run.
+      store.setCredentials({
+        accessToken: 'phx_injected_for_demo',
+        projectApiKey: 'phc_demo',
+        host: 'https://us.posthog.com',
+        projectId: 1,
+      });
+    } else if (screen === ScreenId.Run) {
+      // Simulate the agent run completing (real agent proven via the gateway).
+      store.setRunPhase(RunPhase.Running);
+      store.setRunPhase(RunPhase.Completed);
+    } else if (screen === ScreenId.Outro) {
+      driver.performAction('dismiss_outro');
+    } else if (screen === ScreenId.Mcp) {
+      driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
+    } else if (screen === ScreenId.McpSuggestedPrompts) {
+      driver.performAction('dismiss');
+    } else if (screen === ScreenId.SlackConnect) {
+      driver.performAction('dismiss_slack');
+    } else if (screen === ScreenId.KeepSkills) {
+      driver.performAction('keep_skills', { kept: true });
+      break; // terminal: skillsComplete is the run's done-signal
+    } else if (screen === Overlay.WizardAsk) {
+      // Not used by the integration program, but handle it generically.
+      const q = state.pendingQuestion!.questions[0];
+      driver.performAction('answer_question', {
+        answers: { [q.id]: q.options?.[0]?.value ?? 'ok' },
+      });
+    } else {
+      log(`no driver branch for "${screen}" — stopping`);
+      break;
+    }
+  }
+
+  const done = store.session.skillsComplete;
+  process.stdout.write(
+    `\n${done ? '✓' : '✗'} skillsComplete=${done} path: ${seen.join(
+      ' → ',
+    )}\n\n`,
+  );
+  process.exit(done ? 0 : 1); // exit code mirrors skillsComplete
+}
+
+main().catch((e) => {
+  process.stderr.write(`\nDEMO_FAIL: ${e?.stack ?? e}\n`);
+  process.exit(1);
+});
diff --git a/scripts/e2e-full-run.no-jest.ts b/scripts/e2e-full-run.no-jest.ts
new file mode 100644
index 00000000..3bbea84a
--- /dev/null
+++ b/scripts/e2e-full-run.no-jest.ts
@@ -0,0 +1,281 @@
+/**
+ * Full headless e2e — runs the REAL wizard integration flow against prod cloud,
+ * driven entirely by WizardCiDriver. No Ink, no browser, no LoggingUI.
+ *
+ * Unlike classic `--ci` (LoggingUI: runs the agent then exits, skipping the
+ * intro / setup / mcp / slack / keep-skills screens and offering only
+ * stdout to assert on), this runs the WHOLE interactive flow — the driver makes
+ * each human-side decision through the same store setters the Ink UI would, and
+ * the run is observed through structured `read_state`.
+ *
+ * POSTHOG_PERSONAL_API_KEY=… APP_DIR=/tmp/run-x PROJECT_ID=228144 \
+ * tsx scripts/e2e-full-run.no-jest.ts
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { execFileSync } from 'child_process';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase } from '@lib/wizard-session';
+import { Program } from '@lib/programs/program-registry';
+import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import { runAgent } from '@lib/agent/agent-runner';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import type { ScreenName } from '@ui/tui/router';
+import {
+ decideE2eAction,
+ DEFAULT_E2E_PROFILE,
+ type WizardE2eProfile,
+} from '@lib/ci-driver/e2e-profile';
+import { WizardRecorder } from '@lib/ci-driver/recorder';
+
+const log = (m: string) => process.stdout.write(`[e2e] ${m}\n`);
+
+/** Snapshot package.json deps + file list, to diff before/after. */
+function snapshot(dir: string): { deps: string[]; files: Set<string> } {
+  const files = new Set<string>();
+  // Paths are relative to `dir`; node_modules/.git are skipped at any depth.
+  const walk = (d: string, rel = ''): void => {
+    for (const e of fs.readdirSync(d, { withFileTypes: true })) {
+      if (e.name === 'node_modules' || e.name === '.git') continue;
+      const r = path.join(rel, e.name);
+      if (e.isDirectory()) walk(path.join(d, e.name), r);
+      else files.add(r);
+    }
+  };
+  walk(dir);
+  let deps: string[] = [];
+  try {
+    const raw = fs.readFileSync(path.join(dir, 'package.json'), 'utf8');
+    const pkg = JSON.parse(raw);
+    deps = Object.keys({ ...pkg.dependencies, ...pkg.devDependencies });
+  } catch {
+    /* no package.json (some frameworks) */
+  }
+  return { deps, files };
+}
+
+async function main() {
+  const apiKey = (process.env.POSTHOG_PERSONAL_API_KEY ?? '').trim();
+  const appDir = process.env.APP_DIR!;
+  const projectId = process.env.PROJECT_ID ?? '228144';
+  // Happy-path e2e policy: skip MCP + Slack always; KEEP vs DELETE skills is the
+  // one knob (default = delete, matching `wizard-ci --e2e`). Health-check issues
+  // are always dismissed so a transient outage never blocks the run.
+  const keepSkills = process.env.E2E_KEEP_SKILLS === 'true';
+  if (!apiKey) throw new Error('Set POSTHOG_PERSONAL_API_KEY');
+  if (!appDir || !fs.existsSync(appDir))
+    throw new Error(`APP_DIR missing: ${appDir}`);
+
+  const before = snapshot(appDir);
+  log(
+    `app: ${appDir} (project ${projectId}) files=${before.files.size} deps=${before.deps.length}`,
+  );
+
+  const store = new WizardStore(Program.PostHogIntegration);
+  setUI(new InkUI(store)); // real UI, never rendered
+  const session = buildSession({
+    installDir: appDir,
+    ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
+    apiKey,
+    projectId, // the key's scoped project — required, else bootstrap 403s
+    region: 'us',
+  });
+  store.session = session;
+
+  // Record the run as a timeline of key-moment frames (route changes, task
+  // updates, status lines, …) so it can be replayed in the terminal later.
+  const recorder = new WizardRecorder(store, {
+    program: 'posthog-integration',
+    app: path.basename(appDir),
+  });
+  recorder.start();
+
+  const driver = new WizardCiDriver(store);
+
+  // The program OWNS its e2e UI choices (ProgramConfig.e2e). The harness is
+  // generic: it asks decideE2eAction what to commit on each screen. The
+  // --keep-skills flag (E2E_KEEP_SKILLS) overrides the profile's skills policy.
+  const profile: WizardE2eProfile = {
+    ...DEFAULT_E2E_PROFILE,
+    ...(posthogIntegrationConfig.e2e ?? {}),
+    ...(keepSkills ? { skills: 'keep' as const } : {}),
+  };
+  log(`e2e profile: ${JSON.stringify(profile)}`);
+
+  // Concurrent driver loop: commits the profile's decision on each screen as it
+  // appears, until the run signals skillsComplete.
+  const seen: ScreenName[] = [];
+  let stop = false;
+  const driverLoop = async () => {
+    while (!stop && !store.session.skillsComplete) {
+      const state = driver.readState();
+      const before = state.currentScreen; // shadows the outer `before` snapshot
+      if (seen[seen.length - 1] !== before) {
+        seen.push(before);
+        log(`screen → ${before}`);
+      }
+      let acted = false;
+      try {
+        const decision = decideE2eAction(state, profile);
+        if (decision.action) {
+          driver.performAction(
+            decision.action.id,
+            decision.action.params ?? {},
+          );
+          acted = true;
+        }
+        if (decision.done) stop = true;
+      } catch (e) {
+        log(`driver action error on ${before}: ${(e as Error).message}`);
+      }
+      // If our own commit already advanced the screen (driver-driven sequences
+      // like outro→mcp→slack→keep-skills), loop immediately to drive the next
+      // one. Only block on waitForChange when we're waiting on an EXTERNAL
+      // transition (the health probe, auth bootstrap, or the agent run).
+      if (acted && store.currentScreen !== before) continue;
+      await driver.waitForChange(600_000);
+    }
+  };
+
+  const drive = driverLoop();
+
+  // Reproduce run-wizard.ts headlessly: detection → init probe → gates → agent.
+  await store.runReadyHooks();
+  log(`detected: ${store.session.integration ?? '(none)'}`);
+  store.runInitHooks(); // fires the health-check readiness probe
+  await store.getGate('intro');
+  await store.getGate('health-check');
+  log('gates cleared (intro + health) — starting real agent');
+
+  await runAgent(posthogIntegrationConfig, store.session);
+  log(`agent run finished: runPhase=${store.session.runPhase}`);
+
+  // Give the driver up to 120s to walk the post-run screens to completion.
+  const deadline = Date.now() + 120_000;
+  while (!store.session.skillsComplete && Date.now() < deadline) {
+    await driver.waitForChange(5_000);
+  }
+  stop = true;
+  await Promise.race([drive, Promise.resolve()]); // settles at once; never blocks on the loop
+
+  // "Delete skills" is a KeepSkillsScreen side-effect (it `rm`s the
+  // wizard-installed skill dirs), not a store setter — so the headless driver's
+  // keep_skills{kept:false} only flips the flag. Replicate the deletion here, in
+  // the orchestrator, where fs side-effects belong. Mirrors the screen: remove
+  // each wizard-marked skill dir, then the skills/ dir if it's left empty.
+  let skillsDeleted = false;
+  if (profile.skills === 'delete') {
+    const skillsDir = path.join(appDir, '.claude', 'skills');
+    if (fs.existsSync(skillsDir)) {
+      for (const dir of fs.readdirSync(skillsDir, { withFileTypes: true })) {
+        if (!dir.isDirectory()) continue;
+        if (fs.existsSync(path.join(skillsDir, dir.name, '.posthog-wizard'))) {
+          fs.rmSync(path.join(skillsDir, dir.name), {
+            recursive: true,
+            force: true,
+          });
+          skillsDeleted = true;
+        }
+      }
+      if (fs.readdirSync(skillsDir).length === 0) {
+        fs.rmSync(skillsDir, { recursive: true, force: true });
+      }
+    }
+    log(`skills deleted: ${skillsDeleted}`);
+  }
+
+  // Assertions: structured state + real file changes.
+  const after = snapshot(appDir);
+  const newDeps = after.deps.filter((d) => !before.deps.includes(d));
+  const newFiles = [...after.files].filter((f) => !before.files.has(f));
+  const hasPosthogDep = after.deps.some((d) =>
+    d.toLowerCase().includes('posthog'),
+  );
+  // Detect a PostHog env file directly on disk (more robust than a file diff:
+  // an .env may have pre-existed and only had keys appended).
+  const envFile = [...after.files]
+    .filter((f) => path.basename(f).startsWith('.env'))
+    .find((f) => {
+      try {
+        return /POSTHOG/i.test(fs.readFileSync(path.join(appDir, f), 'utf8'));
+      } catch {
+        return false;
+      }
+    });
+
+  log('');
+  log('================ RESULT ================');
+  log(`screen path : ${seen.join(' → ')}`);
+  log(`runPhase : ${store.session.runPhase}`);
+  log(`skillsComplete: ${store.session.skillsComplete}`);
+  log(`new deps : ${newDeps.join(', ') || '(none)'}`);
+  log(`posthog dep : ${hasPosthogDep}`);
+  log(`new files : ${newFiles.join(', ') || '(none)'}`);
+  log(`.env written: ${envFile ?? 'no'}`);
+
+  const integrated =
+    store.session.runPhase === RunPhase.Completed &&
+    (hasPosthogDep || !!envFile);
+  log(
+    `\n${
+      integrated ? '✓ FULL INTEGRATION LANDED' : '✗ integration incomplete'
+    }`,
+  );
+  log('========================================');
+
+  // Structured result for a harness/orchestrator (e.g. the workbench service) to
+  // assert on — the control plane's payoff over stdout-grepping classic --ci.
+  const result = {
+    integrated,
+    installDir: appDir,
+    screenPath: seen,
+    runPhase: store.session.runPhase,
+    skillsComplete: store.session.skillsComplete,
+    skillsPolicy: profile.skills,
+    skillsDeleted,
+    newDeps,
+    hasPosthogDep,
+    newFiles,
+    envFile: envFile ?? null,
+  };
+  const resultPath = process.env.E2E_RESULT_JSON;
+  if (resultPath) {
+    fs.writeFileSync(resultPath, JSON.stringify(result, null, 2));
+    log(`result json → ${resultPath}`);
+  }
+
+  // Save the run recording and tell the caller how to replay it.
+  recorder.stop();
+  const recordingPath =
+    process.env.E2E_RECORDING_JSON ??
+    `/tmp/wizard-e2e-${path.basename(appDir)}.recording.json`;
+  fs.writeFileSync(
+    recordingPath,
+    JSON.stringify(recorder.getRecording(), null, 2),
+  );
+  log(`recording (${recorder.frameCount} frames) → ${recordingPath}`);
+  log(`replay it: tsx scripts/replay-e2e.no-jest.ts ${recordingPath} --step`);
+
+  process.exit(integrated ? 0 : 1);
+}
+
+main().catch((e) => {
+  process.stderr.write(`\nE2E_FAIL: ${e?.stack ?? e}\n`);
+  process.exit(1);
+});
+
+// Not called anywhere in this script. Exported so the `execFileSync` import
+// stays referenced: rsync-copies `from` into `to`, minus node_modules and .git.
+export const _copy = (from: string, to: string) =>
+  execFileSync('rsync', [
+    '-a',
+    '--exclude',
+    'node_modules',
+    '--exclude',
+    '.git',
+    `${from}/`,
+    `${to}/`,
+  ]);
diff --git a/scripts/replay-e2e.no-jest.ts b/scripts/replay-e2e.no-jest.ts
new file mode 100644
index 00000000..ff9935ec
--- /dev/null
+++ b/scripts/replay-e2e.no-jest.ts
@@ -0,0 +1,71 @@
+/**
+ * Replay a recorded wizard run in the terminal.
+ *
+ *   tsx scripts/replay-e2e.no-jest.ts <recording.json> [--step] [--delay <ms>]
+ *
+ *   --step        advance frame-by-frame on Enter (default)
+ *   --delay <ms>  auto-play with <ms> between frames (e.g. --delay 1200)
+ */
+import { createInterface } from 'readline';
+import type { ProgramId } from '@ui/tui/router';
+import { loadRecording, renderFrame, frameHeader } from '@lib/ci-driver/replay';
+
+const ENTER_ALT = '\x1b[?1049h'; // switch to the alternate screen buffer
+const LEAVE_ALT = '\x1b[?1049l'; // switch back to the main screen buffer
+const CLEAR = '\x1b[2J\x1b[H'; // erase the screen + home the cursor
+
+async function main() {
+  const args = process.argv.slice(2);
+  const delayArg = args.indexOf('--delay');
+  // Index of the --delay value (-1 if absent) so it is never taken as the file.
+  const delayValueAt = delayArg === -1 ? -1 : delayArg + 1;
+  const file = args.find((a, i) => !a.startsWith('-') && i !== delayValueAt);
+  const autoDelay = delayArg === -1 ? null : Number(args[delayValueAt]);
+  const step = autoDelay === null; // default to step unless --delay given
+  if (!file || (autoDelay !== null && !Number.isFinite(autoDelay))) {
+    process.stderr.write(
+      'usage: replay-e2e <recording.json> [--step] [--delay <ms>]\n',
+    );
+    process.exit(2);
+  }
+
+  const rec = loadRecording(file);
+  const program = rec.meta.program as ProgramId;
+  const total = rec.frames.length;
+
+  process.stdout.write(ENTER_ALT);
+  const cleanup = () => process.stdout.write(LEAVE_ALT);
+  process.on('exit', cleanup);
+  const rl = step
+    ? createInterface({ input: process.stdin, output: process.stdout })
+    : null;
+  const ask = (q: string) =>
+    new Promise<void>((res) => rl!.question(q, () => res()));
+  const sleep = (ms: number) => new Promise((res) => setTimeout(res, ms));
+
+  for (const frame of rec.frames) {
+    process.stdout.write(CLEAR);
+    process.stdout.write(`\x1b[2m${rec.meta.app ?? rec.meta.program}\x1b[0m\n`);
+    process.stdout.write(frameHeader(frame, total) + '\n\n');
+    process.stdout.write(renderFrame(frame, program) + '\n');
+    if (step) {
+      await ask(
+        `\n\x1b[2m[${
+          frame.seq + 1
+        }/${total}] Enter ▸ next · Ctrl-C ▸ quit\x1b[0m`,
+      );
+    } else {
+      await sleep(autoDelay!);
+    }
+  }
+  rl?.close();
+  cleanup();
+  process.stdout.write(`\nReplayed ${total} frames from ${file}\n`);
+  process.exit(0);
+}
+
+main().catch((e) => {
+  process.stdout.write(LEAVE_ALT);
+  process.stderr.write(`replay failed: ${e?.stack ?? e}\n`);
+  process.exit(1);
+});
diff --git a/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md b/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
new file mode 100644
index 00000000..a0c53b02
--- /dev/null
+++ b/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
@@ -0,0 +1,130 @@
+# Driving wizard e2e runs from an agent
+
+For a future AI agent asked to run a **real** wizard integration end-to-end and
+check it worked. This is the control-plane path (`wizard-ci --e2e`): it runs the
+WHOLE interactive flow headlessly via `wizard-ci-tools` and asserts on structured
+state — not the classic `--ci` (LoggingUI, stdout-grep, agent-only).
+
+It complements the human runbook `workbench/ci-verify-plan.md` (read that too —
+it has the key/region/build-channel facts). This doc is the agent-specific how-to.
+
+## The one command
+
+```bash
+cd <workspace>/wizard-workbench
+WIZARD_PATH=<workspace>/wizard \
+POSTHOG_PERSONAL_API_KEY="$(cat <workspace>/test-api-key.txt)" \
+POSTHOG_REGION=us \
+ npx tsx services/wizard-ci/index.ts \
+ basic-integration/javascript-node/express-todo --e2e --project-id 228144
+```
+
+Pass `--keep-skills` to keep the installed skills (default deletes them). Swap the
+app path for any `apps/<...>` dir (e.g. `basic-integration/next-js/15-app-router-todo`).
+
+It copies the app to `/tmp`, runs the real agent against prod cloud, drives every
+screen, and prints `✓ E2E PASS` / `✗ E2E FAIL` + a `/tmp/wizard-e2e-<app>.json`
+result. Exit 0 = pass. A run takes **~3-8 min** (gateway round-trips dominate).
+
+## The four things that bite an agent (and why)
+
+1. **You are running INSIDE a Claude Code session.** Its env
+ (`CLAUDECODE`, `CLAUDE_CODE_SDK_HAS_*_REFRESH`, `ANTHROPIC_*`, …) makes the
+ wizard's spawned agent defer auth to the host → `apiKeySource: none` → **401
+ auth-error**. The wizard-ci `--e2e` path strips these for the child, so the
+ one command above is safe. If you ever invoke the harness directly, strip them
+ yourself (see `STRIP_ENV` in `services/wizard-ci/e2e.ts`). A plain CI shell
+ doesn't have these, so it never hits this.
+
+2. **The test key is project-scoped.** `test-api-key.txt` only reads project
+ **228144** ("cookiesssss", US). Without `--project-id 228144` (or
+ `POSTHOG_WIZARD_PROJECT_ID`), bootstrap 403s ("Access denied while trying to
+ fetch project data"). The key is still valid — it authenticates and works as
+ the LLM gateway bearer; it just isn't scoped to the default team.
+
+3. **Never run on the real fixture.** Always a `/tmp` copy (the harness does
+ this). The runbook: after any accidental run on a real app, `git checkout` it.
+
+4. **Runs are sequential, and minutes long.** The agent log is a single shared
+ file (`/tmp/posthog-wizard.log`) — never run two at once. Launch with
+ `run_in_background: true` and watch with a Monitor on the output file; don't
+ block. Watch for: `screen →`, `assertions`, `E2E PASS/FAIL`, and `auth-error`.
+
+## How to read the result
+
+`/tmp/wizard-e2e-<app>.json` (and the stdout assertions):
+
+| field | pass when |
+|---|---|
+| `runPhase` | `"completed"` (the agent finished) |
+| `hasPosthogDep` / `envFile` | a posthog dep was added and/or a `.env*` written |
+| `screenPath` | includes `keep-skills` (full flow walked) |
+| `skillsComplete` | `true` (run reached its done-signal) |
+| `skillsDeleted` | `true` when policy = delete |
+
+Also eyeball the `/tmp/` copy: `package.json` has `posthog-*`, an `.env*`
+has `POSTHOG_*`, and framework-specific files exist (e.g. Next.js
+`instrumentation-client.ts` with `posthog.init(...)`).
+
+## How it's built (so you can change it)
+
+```
+wizard-ci --e2e (workbench/services/wizard-ci/{index,e2e}.ts)
+ → spawns the wizard repo's headless harness (env-stripped, /tmp copy):
+ wizard/scripts/e2e-full-run.no-jest.ts
+ · real WizardStore + InkUI (never rendered) — no terminal, no browser
+ · real runAgent → prod gateway (phx key as bearer, --project-id)
+ · a concurrent WizardCiDriver drives each screen
+ → reads E2E_RESULT_JSON and asserts
+```
+
+The driver is `wizard/src/lib/ci-driver/` — `WizardCiDriver` (read_state /
+list_actions / perform_action), the screen→action registry, and the
+`wizard-ci-tools` MCP server.
+
+**To change what the run clicks**, edit the program's **e2e profile** — the UI
+choices live ON the program, not in the harness:
+`posthogIntegrationConfig.e2e` (`src/lib/programs/posthog-integration/index.ts`),
+typed by `WizardE2eProfile` (`src/lib/ci-driver/e2e-profile.ts`). The harness
+asks `decideE2eAction(state, profile)` what to commit on each screen. To make a
+*different* program e2e-drivable, give it an `e2e` profile too.
+
+**The flow is snapshot-tested** offline (no agent, deterministic):
+`src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts` golden-checks the
+(screen → decision) trace. If you change the flow or a profile, update with
+`jest -u`. This is the structured-state analog of the TUI ANSI screenshots in
+`scripts/__screenshots__/`.
+
+## Record & replay (verify a run after the fact)
+
+Every `--e2e` run is **recorded** as a timeline of key-moment frames — one each
+time the store/router changes (a route change, a task-list update, a new status
+line, a runPhase change, an overlay). The recording lands at
+`/tmp/wizard-e2e-<app>.recording.json` and the run prints the replay command.
+
+Replay reconstructs each frame's store and renders the **real Ink screen** back
+to the terminal, so you (agent or human) can watch the run play back to verify it:
+
+```bash
+pnpm wizard-ci --replay /tmp/wizard-e2e-<app>.recording.json # Enter ▸ step
+pnpm wizard-ci --replay /tmp/wizard-e2e-<app>.recording.json --delay 1200 # auto
+```
+
+As an agent you can't sit in the interactive stepper, but you can: (a) read the
+recording JSON directly (each frame has `triggers`, `screen`, `tasks`,
+`statusMessages`, redacted `session`) to assert the run hit the right moments, or
+(b) render specific frames to ANSI offline with `renderFrame()` from
+`src/lib/ci-driver/replay.ts`. The access token is redacted, so recordings are
+safe to share. Code: `recorder.ts` (capture) + `replay.ts` (render).
+
+## Driving it as a true LLM loop (optional)
+
+The `wizard-ci-tools` MCP server exposes `read_state` / `list_actions` /
+`perform_action` to an external driver. To have an LLM (not a scripted profile)
+play the user, connect a driver model to that server and loop
+`read_state → reason → perform_action`. Proven working: a gateway model called
+`perform_action {action:"confirm_setup"}` and advanced the real store. For
+deterministic CI, prefer the scripted profile above; reserve the LLM loop for
+fuzzing the flow. Auth caveat: a bare `query()` 401s on the follow-up turn
+through the `/wizard` gateway — route through the wizard's real `initializeAgent`
+for multi-turn (see `wizard-ci-tools-research.md`).
diff --git a/src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap b/src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
new file mode 100644
index 00000000..a7574afb
--- /dev/null
+++ b/src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
@@ -0,0 +1,93 @@
+// Jest Snapshot v1, https://goo.gl/fbAQLP
+
+exports[`e2e flow snapshot — posthog-integration Next.js (with a setup question) walks a stable path 1`] = `
+{
+ "profile": {
+ "ask": "first",
+ "healthCheck": "dismiss",
+ "mcp": "skip",
+ "setup": "first",
+ "skills": "delete",
+ "slack": "skip",
+ },
+ "program": "posthog-integration",
+ "trace": [
+ {
+ "action": "confirm_setup",
+ "screen": "intro",
+ },
+ {
+ "action": "dismiss_outage",
+ "screen": "health-check",
+ },
+ {
+ "action": "choose",
+ "screen": "setup",
+ },
+ {
+ "action": "(external)",
+ "screen": "auth",
+ },
+ {
+ "action": "(external)",
+ "screen": "run",
+ },
+ {
+ "action": "dismiss_outro",
+ "screen": "outro",
+ },
+ {
+ "action": "set_mcp_outcome",
+ "screen": "mcp",
+ },
+ {
+ "action": "dismiss_slack",
+ "screen": "slack-connect",
+ },
+ {
+ "action": "keep_skills",
+ "screen": "keep-skills",
+ },
+ ],
+}
+`;
+
+exports[`e2e flow snapshot — posthog-integration Node (no setup question) walks a stable path 1`] = `
+{
+ "program": "posthog-integration",
+ "trace": [
+ {
+ "action": "confirm_setup",
+ "screen": "intro",
+ },
+ {
+ "action": "dismiss_outage",
+ "screen": "health-check",
+ },
+ {
+ "action": "(external)",
+ "screen": "auth",
+ },
+ {
+ "action": "(external)",
+ "screen": "run",
+ },
+ {
+ "action": "dismiss_outro",
+ "screen": "outro",
+ },
+ {
+ "action": "set_mcp_outcome",
+ "screen": "mcp",
+ },
+ {
+ "action": "dismiss_slack",
+ "screen": "slack-connect",
+ },
+ {
+ "action": "keep_skills",
+ "screen": "keep-skills",
+ },
+ ],
+}
+`;
diff --git a/src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts b/src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts
new file mode 100644
index 00000000..b749ac03
--- /dev/null
+++ b/src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts
@@ -0,0 +1,96 @@
+/**
+ * E2E flow snapshot — the structured-state analog of Sarah's TUI ANSI
+ * screenshots (`scripts/cli-screenshots.mjs`, `__screenshots__/*.ans`).
+ *
+ * Her harness snapshots what a screen *renders*; this snapshots the
+ * deterministic control-plane *trace* a `wizard-ci --e2e` run walks: the
+ * ordered (screen → committed decision) path the program's `e2e` profile
+ * produces. It runs fully offline — the agent and auth are stubbed by injecting
+ * the external transitions the runner/agent would make — so it's deterministic
+ * and CI-safe, and it fails when the flow shape regresses (a screen appears or
+ * disappears, the order changes, or a profile decision changes).
+ *
+ * Update goldens with `jest -u` after an intentional flow change.
+ */
+
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase } from '@lib/wizard-session';
+import { Integration } from '@lib/constants';
+import { FRAMEWORK_REGISTRY } from '@lib/registry';
+import { WizardReadiness } from '@lib/health-checks/readiness';
+import { Program } from '@lib/programs/program-registry';
+import { ScreenId } from '@ui/tui/router';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { WizardCiDriver } from '../wizard-ci-driver';
+import { decideE2eAction, DEFAULT_E2E_PROFILE } from '../e2e-profile';
+
+/**
+ * Walk the program flow offline using its e2e profile, injecting the external
+ * transitions a real run gets from the runner (auth) and the agent (runPhase)
+ * and the health probe. Returns the ordered (screen, action) trace.
+ */
+function traceFlow(
+ integration: Integration,
+): Array<{ screen: string; action: string }> {
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store));
+ const session = buildSession({ installDir: '/tmp/e2e-snap', ci: true });
+ session.integration = integration;
+ session.frameworkConfig = FRAMEWORK_REGISTRY[integration];
+ store.session = session;
+
+ const driver = new WizardCiDriver(store);
+ const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+
+ const trace: Array<{ screen: string; action: string }> = [];
+ for (let guard = 0; guard < 40; guard++) {
+ const state = driver.readState();
+ const screen = state.currentScreen;
+ const decision = decideE2eAction(state, profile);
+ trace.push({ screen, action: decision.action?.id ?? '(external)' });
+
+ if (decision.action) {
+ driver.performAction(decision.action.id, decision.action.params ?? {});
+ }
+
+ // Inject the transitions a real run gets from outside the driver.
+ if (screen === ScreenId.HealthCheck) {
+ store.setReadinessResult({
+ decision: WizardReadiness.Yes,
+ health: {} as never,
+ reasons: [],
+ });
+ } else if (screen === ScreenId.Auth) {
+ store.setCredentials({
+ accessToken: 'phx_x',
+ projectApiKey: 'phc_x',
+ host: 'https://us.posthog.com',
+ projectId: 1,
+ });
+ } else if (screen === ScreenId.Run) {
+ store.setRunPhase(RunPhase.Completed);
+ }
+
+ if (decision.done || store.session.skillsComplete) break;
+ }
+ return trace;
+}
+
+describe('e2e flow snapshot — posthog-integration', () => {
+ it('Next.js (with a setup question) walks a stable path', () => {
+ expect({
+ program: 'posthog-integration',
+ profile: posthogIntegrationConfig.e2e,
+ trace: traceFlow(Integration.nextjs),
+ }).toMatchSnapshot();
+ });
+
+ it('Node (no setup question) walks a stable path', () => {
+ expect({
+ program: 'posthog-integration',
+ trace: traceFlow(Integration.javascriptNode),
+ }).toMatchSnapshot();
+ });
+});
diff --git a/src/lib/ci-driver/__tests__/recorder.test.ts b/src/lib/ci-driver/__tests__/recorder.test.ts
new file mode 100644
index 00000000..ccc20cb4
--- /dev/null
+++ b/src/lib/ci-driver/__tests__/recorder.test.ts
@@ -0,0 +1,103 @@
+/**
+ * Recorder unit test: the key-moment capture logic. (Frame *rendering* is
+ * validated via tsx — jest globally mocks `ink` — see replay.ts.)
+ */
+
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase } from '@lib/wizard-session';
+import { Integration } from '@lib/constants';
+import { FRAMEWORK_REGISTRY } from '@lib/registry';
+import { WizardReadiness } from '@lib/health-checks/readiness';
+import { Program } from '@lib/programs/program-registry';
+import { ScreenId } from '@ui/tui/router';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { WizardCiDriver } from '../wizard-ci-driver';
+import { decideE2eAction, DEFAULT_E2E_PROFILE } from '../e2e-profile';
+import { WizardRecorder } from '../recorder';
+
+function recordedRun() {
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store));
+ const session = buildSession({ installDir: '/tmp/rec', ci: true });
+ session.integration = Integration.nextjs;
+ session.frameworkConfig = FRAMEWORK_REGISTRY[Integration.nextjs];
+ store.session = session;
+
+ let clock = 0;
+ const rec = new WizardRecorder(
+ store,
+ { program: 'posthog-integration', app: 'demo' },
+ () => (clock += 500),
+ );
+ rec.start();
+
+ const driver = new WizardCiDriver(store);
+ const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+ for (let i = 0; i < 40; i++) {
+ const state = driver.readState();
+ const d = decideE2eAction(state, profile);
+ if (d.action) driver.performAction(d.action.id, d.action.params ?? {});
+ if (state.currentScreen === ScreenId.HealthCheck) {
+ store.setReadinessResult({
+ decision: WizardReadiness.Yes,
+ health: {} as never,
+ reasons: [],
+ });
+ } else if (state.currentScreen === ScreenId.Auth) {
+ store.setCredentials({
+ accessToken: 'phx_topsecret',
+ projectApiKey: 'phc_x',
+ host: 'https://us.posthog.com',
+ projectId: 1,
+ });
+ } else if (state.currentScreen === ScreenId.Run) {
+ store.pushStatus('Installing posthog-js…');
+ store.setTasks([
+ { label: 'Install SDK', status: 'completed' as never, done: true },
+ ]);
+ store.setRunPhase(RunPhase.Completed);
+ }
+ if (d.done || store.session.skillsComplete) break;
+ }
+ rec.stop();
+ return rec.getRecording();
+}
+
+describe('WizardRecorder', () => {
+ it('captures a frame at each key moment, labelled by trigger', () => {
+ const rec = recordedRun();
+ const triggers = rec.frames.map((f) => f.triggers.join('+'));
+
+ // First frame is the initial snapshot; every route lands a 'screen' frame.
+ expect(triggers[0]).toBe('start');
+ expect(rec.frames.map((f) => f.screen)).toEqual(
+ expect.arrayContaining([
+ ScreenId.Intro,
+ ScreenId.HealthCheck,
+ ScreenId.Setup,
+ ScreenId.Auth,
+ ScreenId.Run,
+ ScreenId.Outro,
+ ScreenId.Mcp,
+ ScreenId.SlackConnect,
+ ScreenId.KeepSkills,
+ ]),
+ );
+ // Task + status updates during the run are their own key moments.
+ expect(triggers).toContain('tasks');
+ expect(triggers).toContain('status');
+ // Frames carry monotonic timestamps.
+ expect(rec.frames.map((f) => f.ms)).toEqual(
+ [...rec.frames.map((f) => f.ms)].sort((a, b) => a - b),
+ );
+ });
+
+ it('redacts the access token from the recording', () => {
+ const rec = recordedRun();
+ expect(JSON.stringify(rec)).not.toContain('phx_topsecret');
+ const authed = rec.frames.find((f) => f.session.credentials);
+ expect(authed?.session.credentials?.accessToken).toBe('phx_***redacted***');
+ });
+});
diff --git a/src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts b/src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts
new file mode 100644
index 00000000..230923b5
--- /dev/null
+++ b/src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts
@@ -0,0 +1,183 @@
+/**
+ * Control-plane test: drive a REAL WizardStore through the full integration
+ * screen sequence using only the WizardCiDriver — proving read_state is a
+ * truthful projection of router-resolved state and that perform_action commits
+ * cause the same transitions the interactive UI would.
+ *
+ * The agent/auth steps are simulated by committing through the same store the
+ * runner mutates (the SDK is mocked in jest); every *human* decision goes
+ * through the driver.
+ */
+
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase, McpOutcome } from '@lib/wizard-session';
+import { Integration } from '@lib/constants';
+import { FRAMEWORK_REGISTRY } from '@lib/registry';
+import { WizardReadiness } from '@lib/health-checks/readiness';
+import { ScreenId, Overlay } from '@ui/tui/router';
+import { Program } from '@lib/programs/program-registry';
+import { WizardCiDriver, UnknownActionError } from '../wizard-ci-driver';
+import { ACTION_REGISTRY, NO_ACTION_SCREENS } from '../action-registry';
+
+function freshStore(): WizardStore {
+ const store = new WizardStore(Program.PostHogIntegration);
+ // Headless: a real store + InkUI (which only forwards to the store), no Ink
+ // render. setUI so any getUI() path the store touches resolves.
+ setUI(new InkUI(store));
+ const session = buildSession({
+ installDir: '/tmp/ci-driver-test',
+ ci: true, // OAuth-bypass + ai-opt-in auto-consent semantics
+ });
+ session.integration = Integration.nextjs;
+ session.frameworkConfig = FRAMEWORK_REGISTRY[Integration.nextjs];
+ store.session = session;
+ return store;
+}
+
+const cleanReadiness = {
+ decision: WizardReadiness.Yes,
+ health: {} as never,
+ reasons: [] as string[],
+};
+
+describe('WizardCiDriver — full integration flow', () => {
+ it('walks intro → setup → run → outro → mcp → slack → keep-skills', () => {
+ const store = freshStore();
+ const driver = new WizardCiDriver(store);
+
+ // 1. Intro
+ expect(driver.readState().currentScreen).toBe(ScreenId.Intro);
+ expect(driver.listActions().map((a) => a.id)).toContain('confirm_setup');
+ driver.performAction('confirm_setup');
+
+ // 2. Health check — blocks until a readiness result lands (mirrors onInit
+ // probe). Simulate a clean probe; router advances past it.
+ expect(driver.readState().currentScreen).toBe(ScreenId.HealthCheck);
+ store.setReadinessResult(cleanReadiness);
+
+ // 3. Setup — Next.js asks for the router. The driver reads the question
+ // off read_state and commits the answer via `choose`.
+ const state = driver.readState();
+ expect(state.currentScreen).toBe(ScreenId.Setup);
+ expect(state.setupQuestions).toHaveLength(1);
+ expect(state.setupQuestions[0].key).toBe('router');
+ const appValue = state.setupQuestions[0].options[0].value;
+ driver.performAction('choose', { key: 'router', value: appValue });
+
+ // 4. Auth — no user action; the runner sets credentials headlessly using
+ // the phx key. Simulate that commit.
+ expect(driver.readState().currentScreen).toBe(ScreenId.Auth);
+ store.setCredentials({
+ accessToken: 'phx_secret_should_not_leak',
+ projectApiKey: 'phc_public',
+ host: 'https://us.posthog.com',
+ projectId: 42,
+ });
+
+ // 5. ai-opt-in auto-completes (ci=true), so we land on Run. The agent runs
+ // here; simulate it finishing.
+ expect(driver.readState().currentScreen).toBe(ScreenId.Run);
+ store.setRunPhase(RunPhase.Running);
+ store.setRunPhase(RunPhase.Completed);
+
+ // 6. Outro
+ expect(driver.readState().currentScreen).toBe(ScreenId.Outro);
+ driver.performAction('dismiss_outro');
+
+ // 7. MCP
+ expect(driver.readState().currentScreen).toBe(ScreenId.Mcp);
+ driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
+ expect(store.session.mcpOutcome).toBe(McpOutcome.Skipped);
+
+ // 8. Slack
+ expect(driver.readState().currentScreen).toBe(ScreenId.SlackConnect);
+ driver.performAction('dismiss_slack');
+
+ // 9. Keep skills — terminal commit.
+ expect(driver.readState().currentScreen).toBe(ScreenId.KeepSkills);
+ const done = driver.performAction('keep_skills', { kept: true });
+
+ // keep-skills is the terminal step: it has no isComplete predicate, so the
+ // router rests on it. Completion is signalled by skillsComplete — the exact
+ // condition run-wizard.ts awaits to end the run.
+ expect(store.session.skillsComplete).toBe(true);
+ expect(done.currentScreen).toBe(ScreenId.KeepSkills);
+ });
+
+ it('read_state is a truthful projection and never leaks the access token', () => {
+ const store = freshStore();
+ const driver = new WizardCiDriver(store);
+ store.setCredentials({
+ accessToken: 'phx_secret_should_not_leak',
+ projectApiKey: 'phc_public',
+ host: 'https://us.posthog.com',
+ projectId: 7,
+ });
+ const state = driver.readState();
+ // currentScreen always equals what the router resolves.
+ expect(state.currentScreen).toBe(store.currentScreen);
+ expect(state.session.hasCredentials).toBe(true);
+ expect(state.session.projectId).toBe(7);
+ // No raw secret anywhere in the serialized snapshot.
+ expect(JSON.stringify(state)).not.toContain('phx_secret_should_not_leak');
+ });
+
+ it('rejects actions that are not legal on the current screen', () => {
+ const store = freshStore();
+ const driver = new WizardCiDriver(store);
+ expect(driver.readState().currentScreen).toBe(ScreenId.Intro);
+ expect(() => driver.performAction('keep_skills')).toThrow(
+ UnknownActionError,
+ );
+ });
+});
+
+describe('WizardCiDriver — wizard_ask overlay', () => {
+ it('answers a pending question through the driver, resolving the agent promise', async () => {
+ const store = freshStore();
+ const driver = new WizardCiDriver(store);
+
+ // The agent (via the ask bridge) opens a question and awaits the answers.
+ const answersPromise = store.requestQuestion({
+ id: 'q1',
+ source: 'integration-nextjs',
+ questions: [
+ {
+ id: 'router',
+ prompt: 'Which router?',
+ kind: 'single',
+ options: [
+ { label: 'App', value: 'app' },
+ { label: 'Pages', value: 'pages' },
+ ],
+ },
+ ],
+ });
+
+ const state = driver.readState();
+ expect(state.currentScreen).toBe(Overlay.WizardAsk);
+ expect(state.hasOverlay).toBe(true);
+ expect(state.pendingQuestion?.questions[0].id).toBe('router');
+ expect(driver.listActions().map((a) => a.id)).toContain('answer_question');
+
+ // The driver commits the complete answer map directly — skipping the
+ // per-question keystroke walk that lives in React-local state.
+ driver.performAction('answer_question', { answers: { router: 'app' } });
+
+ await expect(answersPromise).resolves.toEqual({ router: 'app' });
+ // Overlay popped; back to the underlying screen.
+ expect(driver.readState().currentScreen).not.toBe(Overlay.WizardAsk);
+ });
+});
+
+describe('action registry exhaustiveness', () => {
+ it('every screen and overlay is either actionable or explicitly no-action', () => {
+ const allScreens = [...Object.values(ScreenId), ...Object.values(Overlay)];
+ const uncovered = allScreens.filter(
+ (s) => !(s in ACTION_REGISTRY) && !NO_ACTION_SCREENS.has(s),
+ );
+ expect(uncovered).toEqual([]);
+ });
+});
diff --git a/src/lib/ci-driver/action-registry.ts b/src/lib/ci-driver/action-registry.ts
new file mode 100644
index 00000000..57e9f080
--- /dev/null
+++ b/src/lib/ci-driver/action-registry.ts
@@ -0,0 +1,270 @@
+/**
+ * Screen → action registry for the CI driver.
+ *
+ * Maps every screen/overlay to the set of *commit* actions a user could
+ * perform on it — and, for each, the single WizardStore setter/resolver that
+ * commit goes through. This is the actuation half of the wizard-ci-tools
+ * surface: instead of injecting keystrokes, a harness names an action and the
+ * driver invokes the same store method the Ink screen's keyboard handler would.
+ *
+ * Discipline mirrors screen-registry.tsx: one entry per screen, kept exhaustive
+ * by a test over the ScreenId/Overlay enums. No product knowledge leaks in —
+ * actions speak only in store setters and generic params.
+ */
+
+import type { WizardStore } from '@ui/tui/store';
+import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
+import { McpOutcome } from '@lib/wizard-session';
+import type { AskAnswers } from '@lib/wizard-session';
+
+/** One commit action legal on a given screen. */
+export interface DriverAction {
+  /** Stable action id named in perform_action. */
+  id: string;
+  /** One-line description of what committing this does. */
+  description: string;
+  /**
+   * Parameter name → human/type hint (documentation only; values are hint
+   * strings). Absent = no params. Required params are checked in `apply`.
+   */
+  params?: Record<string, string>;
+  /** Apply the commit by calling exactly one store setter/resolver. */
+  apply: (store: WizardStore, params: Record<string, unknown>) => void;
+}
+
+/** Thrown when a required action param is absent, not a string, or empty. */
+export class MissingParamError extends Error {
+ constructor(action: string, param: string) {
+ super(`Action "${action}" requires param "${param}".`);
+ this.name = 'MissingParamError';
+ }
+}
+
+/** Return `params[key]` as a non-empty string, else throw MissingParamError. */
+function requireString(
+  action: string,
+  params: Record<string, unknown>,
+  key: string,
+): string {
+  const v = params[key];
+  if (typeof v === 'string' && v.length > 0) return v;
+  // Absent, non-string, and empty-string values are all reported as missing.
+  throw new MissingParamError(action, key);
+}
+
+/**
+ * Screens with no committable user action (the runner or agent advances them):
+ * auth (runner sets credentials), run (agent sets runPhase), ai-opt-in (org
+ * approval / ci auto-consent), exit, audit-run, doctor-report, the source-maps
+ * and audit outros, and the no-dismiss terminal overlays. Listed explicitly so
+ * the exhaustiveness test can tell "intentionally empty" from "forgotten".
+ */
+export const NO_ACTION_SCREENS: ReadonlySet<ScreenName> = new Set<ScreenName>([
+  ScreenId.Auth,
+  ScreenId.Run,
+  ScreenId.AiOptIn,
+  ScreenId.Exit,
+  ScreenId.AuditRun,
+  ScreenId.DoctorReport,
+  ScreenId.SourceMapsOutro,
+  ScreenId.AuditOutro,
+  Overlay.ManagedSettings,
+  Overlay.AuthError,
+  Overlay.SessionTimeout,
+]);
+
+/**
+ * Shared action for intro-style screens whose only action is "confirm and
+ * continue": it commits via `store.completeSetup()` (sets `setupConfirmed`).
+ * Every intro entry in ACTION_REGISTRY reuses this one constant.
+ */
+const confirmSetupAction: DriverAction = {
+ id: 'confirm_setup',
+ description: 'Confirm the intro and continue (sets setupConfirmed).',
+ apply: (store) => store.completeSetup(),
+};
+
+/** Map a raw `outcome` param: 'installed' → Installed, anything else → Skipped. */
+const toMcpOutcome = (raw: unknown) =>
+  raw === 'installed' ? McpOutcome.Installed : McpOutcome.Skipped;
+
+/**
+ * Screen/overlay → the commit actions legal on it. Each action applies its
+ * commit through a WizardStore setter/resolver, so the driver never injects
+ * keystrokes. Partial on purpose: a screen without an entry has no driver
+ * action — `actionsForScreen` returns `[]` for it, and NO_ACTION_SCREENS lists
+ * the ones that are empty intentionally.
+ */
+export const ACTION_REGISTRY: Partial<Record<ScreenName, DriverAction[]>> = {
+  // ── Program intros — confirm & continue ───────────────────────────────
+  [ScreenId.Intro]: [confirmSetupAction],
+  [ScreenId.RevenueIntro]: [confirmSetupAction],
+  [ScreenId.SourceMapsIntro]: [confirmSetupAction],
+  [ScreenId.MigrationIntro]: [confirmSetupAction],
+  [ScreenId.AgentSkillIntro]: [confirmSetupAction],
+  [ScreenId.AuditIntro]: [confirmSetupAction],
+  [ScreenId.DoctorIntro]: [confirmSetupAction],
+
+  // ── Health check — dismiss a blocking outage ──────────────────────────
+  [ScreenId.HealthCheck]: [
+    {
+      id: 'dismiss_outage',
+      description: 'Dismiss the blocking outage screen and continue.',
+      apply: (store) => store.dismissOutage(),
+    },
+  ],
+
+  // ── Framework disambiguation ──────────────────────────────────────────
+  [ScreenId.Setup]: [
+    {
+      id: 'choose',
+      description:
+        'Answer one setup question by committing a framework-context value. ' +
+        'Read read_state.setupQuestions for the key and allowed values.',
+      params: { key: 'setup question key', value: 'chosen option value' },
+      apply: (store, params) => {
+        const key = requireString('choose', params, 'key');
+        const value = requireString('choose', params, 'value');
+        store.setFrameworkContext(key, value);
+      },
+    },
+  ],
+
+  // ── Outro ─────────────────────────────────────────────────────────────
+  [ScreenId.Outro]: [
+    {
+      id: 'dismiss_outro',
+      description: 'Dismiss the outro and advance to the MCP step.',
+      apply: (store) => store.setOutroDismissed(),
+    },
+  ],
+
+  // ── MCP install ───────────────────────────────────────────────────────
+  [ScreenId.Mcp]: [
+    {
+      id: 'set_mcp_outcome',
+      description:
+        'Complete the MCP step. outcome ∈ {installed, skipped}; clients optional.',
+      params: {
+        outcome: '"installed" | "skipped"',
+        clients: 'string[] (optional)',
+      },
+      apply: (store, params) => {
+        const clients = Array.isArray(params.clients)
+          ? params.clients.filter((c): c is string => typeof c === 'string')
+          : [];
+        store.setMcpComplete(toMcpOutcome(params.outcome), clients);
+      },
+    },
+  ],
+  [ScreenId.McpAdd]: [
+    {
+      id: 'set_mcp_outcome',
+      description: 'Complete the standalone MCP-add flow.',
+      params: { outcome: '"installed" | "skipped"' },
+      apply: (store, params) =>
+        store.setMcpComplete(toMcpOutcome(params.outcome)),
+    },
+  ],
+  [ScreenId.McpRemove]: [
+    {
+      id: 'set_mcp_outcome',
+      description: 'Complete the standalone MCP-remove flow.',
+      params: { outcome: '"installed" | "skipped"' },
+      apply: (store, params) =>
+        store.setMcpComplete(toMcpOutcome(params.outcome)),
+    },
+  ],
+  [ScreenId.McpSuggestedPrompts]: [
+    {
+      id: 'dismiss',
+      description: 'Dismiss the suggested-prompts step.',
+      apply: (store) => store.setMcpSuggestedPromptsDismissed(),
+    },
+  ],
+
+  // ── Slack ─────────────────────────────────────────────────────────────
+  [ScreenId.SlackConnect]: [
+    {
+      id: 'dismiss_slack',
+      description: 'Skip or finish the Connect-Slack step.',
+      apply: (store) => store.setSlackStepDismissed(),
+    },
+    {
+      id: 'set_slack_connected',
+      description: 'Mark Slack as connected (then dismiss to advance).',
+      params: { connected: 'boolean' },
+      apply: (store, params) =>
+        store.setSlackConnected(params.connected !== false),
+    },
+  ],
+
+  // ── Keep skills (terminal step of the integration flow) ───────────────
+  [ScreenId.KeepSkills]: [
+    {
+      id: 'keep_skills',
+      description:
+        'Decide whether to keep installed skills; completes the run.',
+      params: { kept: 'boolean (default true)' },
+      apply: (store, params) => store.setSkillsComplete(params.kept !== false),
+    },
+  ],
+
+  // ── Overlays ──────────────────────────────────────────────────────────
+  [Overlay.WizardAsk]: [
+    {
+      id: 'answer_question',
+      description:
+        'Resolve the pending wizard_ask request. Supply a complete answers ' +
+        'map: { [questionId]: string | string[] }. See read_state.pendingQuestion.',
+      params: { answers: 'Record<string, string | string[]>' },
+      apply: (store, params) => {
+        const answers = (params.answers ?? {}) as AskAnswers;
+        store.resolvePendingQuestion(answers);
+      },
+    },
+    {
+      id: 'cancel_question',
+      description: 'Cancel the pending wizard_ask request (sentinel answers).',
+      apply: (store) => store.cancelPendingQuestion(),
+    },
+  ],
+  [Overlay.SettingsOverride]: [
+    {
+      id: 'backup_and_fix',
+      description: 'Back up and fix conflicting .claude/settings.json.',
+      apply: (store) => {
+        store.backupAndFixSettingsOverride();
+      },
+    },
+  ],
+  [Overlay.PortConflict]: [
+    {
+      id: 'resolve_port_conflict',
+      description:
+        'Dismiss the port-conflict overlay and retry the OAuth port loop.',
+      apply: (store) => store.resolvePortConflict(),
+    },
+  ],
+  [Overlay.ManualAuthCode]: [
+    {
+      id: 'submit_auth_code',
+      description: 'Submit a manually-entered OAuth authorization code.',
+      params: { code: 'authorization code' },
+      apply: (store, params) =>
+        store.submitManualAuthCode(
+          requireString('submit_auth_code', params, 'code'),
+        ),
+    },
+    {
+      id: 'dismiss_auth_code',
+      description: 'Dismiss the manual auth-code overlay without submitting.',
+      apply: (store) => store.dismissManualAuthCode(),
+    },
+  ],
+};
+
+/** Commit actions ACTION_REGISTRY holds for `screen`; `[]` when it has none. */
+export function actionsForScreen(screen: ScreenName): DriverAction[] {
+ return ACTION_REGISTRY[screen] ?? [];
+}
diff --git a/src/lib/ci-driver/e2e-profile.ts b/src/lib/ci-driver/e2e-profile.ts
new file mode 100644
index 00000000..9ed909ee
--- /dev/null
+++ b/src/lib/ci-driver/e2e-profile.ts
@@ -0,0 +1,154 @@
+/**
+ * WizardE2eProfile — a program's declarative e2e "test definition": the
+ * UI choices a headless e2e run should make at each decision point.
+ *
+ * This is the test format the design discipline wants: the *choices* are
+ * product knowledge about a program's flow, so they live on the program's
+ * `ProgramConfig.e2e`, not hardcoded in the harness. The harness is generic —
+ * it reads the profile and asks {@link decideE2eAction} what to commit on the
+ * current screen. Add a profile to a program to make it e2e-drivable.
+ */
+
+import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
+import type { CiState } from './wizard-ci-driver.js';
+
+/** Which option to pick for a setup disambiguation question. */
+export type SetupChoice = 'first' | 'last';
+
+export interface WizardE2eProfile {
+ /** Setup disambiguation (e.g. Next.js router): which option to commit. */
+ setup: SetupChoice;
+ /**
+ * Health-check screen: `dismiss` continues even if the probe flags an
+ * outage (sets outageDismissed); `wait` lets only a clean probe through.
+ */
+ healthCheck: 'dismiss' | 'wait';
+ /** Post-agent MCP-install step. */
+ mcp: 'skip' | 'install';
+ /** Connect-Slack step. */
+ slack: 'skip';
+ /** Keep or delete the wizard-installed skills at the end. */
+ skills: 'keep' | 'delete';
+ /** Default answer strategy for an agent `wizard_ask` overlay. */
+ ask: 'first';
+}
+
+/** Happy-path default: take every screen forward, leave nothing behind. */
+export const DEFAULT_E2E_PROFILE: WizardE2eProfile = {
+ setup: 'first',
+ healthCheck: 'dismiss',
+ mcp: 'skip',
+ slack: 'skip',
+ skills: 'delete',
+ ask: 'first',
+};
+
+/** What the harness should do for the current screen. */
+export interface E2eDecision {
+  /** A driver action to commit, if any. */
+  action?: { id: string; params?: Record<string, unknown> };
+  /** Set on the keep-skills screen — the orchestrator does the fs deletion. */
+  skillsPolicy?: 'keep' | 'delete';
+  /** True once the terminal commit has been made. */
+  done?: boolean;
+  /** No action — wait for an external transition (probe, auth, agent run). */
+  wait?: boolean;
+}
+
+/**
+ * Map the current screen + profile to the commit to make. Pure: no store, no
+ * fs — the caller applies the action via the driver and handles `skillsPolicy`.
+ * Returns `{ wait: true }` for screens the runner/agent advances on their own
+ * (auth, run, ai-opt-in, a clean health probe) or with nothing to answer yet.
+ */
+export function decideE2eAction(
+  state: CiState,
+  profile: WizardE2eProfile,
+): E2eDecision {
+  switch (state.currentScreen) {
+    case ScreenId.Intro:
+    case ScreenId.RevenueIntro:
+    case ScreenId.MigrationIntro:
+    case ScreenId.AgentSkillIntro:
+    case ScreenId.AuditIntro:
+    case ScreenId.SourceMapsIntro:
+    case ScreenId.DoctorIntro:
+      return { action: { id: 'confirm_setup' } };
+
+    case ScreenId.HealthCheck:
+      return profile.healthCheck === 'dismiss'
+        ? { action: { id: 'dismiss_outage' } }
+        : { wait: true };
+
+    case ScreenId.Setup: {
+      const q = state.setupQuestions[0];
+      // Nothing to commit until a question with at least one option exists.
+      if (!q || q.options.length === 0) return { wait: true };
+      const opt =
+        profile.setup === 'last'
+          ? q.options[q.options.length - 1]
+          : q.options[0];
+      return {
+        action: { id: 'choose', params: { key: q.key, value: opt.value } },
+      };
+    }
+
+    case ScreenId.Outro:
+      return { action: { id: 'dismiss_outro' } };
+
+    case ScreenId.Mcp: {
+      const outcome = profile.mcp === 'install' ? 'installed' : 'skipped';
+      return { action: { id: 'set_mcp_outcome', params: { outcome } } };
+    }
+
+    case ScreenId.McpSuggestedPrompts:
+      return { action: { id: 'dismiss' } };
+
+    case ScreenId.SlackConnect:
+      return { action: { id: 'dismiss_slack' } };
+
+    case ScreenId.KeepSkills: {
+      const kept = profile.skills === 'keep';
+      return {
+        action: { id: 'keep_skills', params: { kept } },
+        skillsPolicy: profile.skills,
+        done: true,
+      };
+    }
+
+    case Overlay.WizardAsk: {
+      const questions = state.pendingQuestion?.questions ?? [];
+      if (questions.length === 0) return { wait: true };
+      // Answer every question (answer_question wants a complete map), taking the
+      // first option for single/multi and a sentinel for free text.
+      const answers: Record<string, unknown> = {};
+      for (const q of questions) {
+        answers[q.id] = q.options?.[0]?.value ?? 'e2e';
+      }
+      return { action: { id: 'answer_question', params: { answers } } };
+    }
+
+    // auth (runner), run (agent), ai-opt-in (ci), exit, terminal overlays.
+    default:
+      return { wait: true };
+  }
+}
+
+/** Screens this profile knows how to act on — for completeness checks/tests. */
+export const E2E_DRIVABLE_SCREENS: readonly ScreenName[] = [
+  ScreenId.Intro,
+  ScreenId.RevenueIntro,
+  ScreenId.MigrationIntro,
+  ScreenId.AgentSkillIntro,
+  ScreenId.AuditIntro,
+  ScreenId.SourceMapsIntro,
+  ScreenId.DoctorIntro,
+  ScreenId.HealthCheck,
+  ScreenId.Setup,
+  ScreenId.Outro,
+  ScreenId.Mcp,
+  ScreenId.McpSuggestedPrompts,
+  ScreenId.SlackConnect,
+  ScreenId.KeepSkills,
+  Overlay.WizardAsk,
+];
diff --git a/src/lib/ci-driver/recorder.ts b/src/lib/ci-driver/recorder.ts
new file mode 100644
index 00000000..6e65c190
--- /dev/null
+++ b/src/lib/ci-driver/recorder.ts
@@ -0,0 +1,159 @@
+/**
+ * WizardRecorder — records a wizard run as a timeline of frames, one per "key
+ * moment", so the run can be replayed in the terminal later (by an agent or a
+ * human) to verify what happened.
+ *
+ * Key moments are store/router changes: a route (screen) change, a runPhase
+ * change, a task-list update, a new status line, an event-plan update, or an
+ * overlay push/pop. The recorder subscribes to the store's single version
+ * counter (the same signal React uses) and snapshots state whenever one of
+ * those changes — so the recording mirrors exactly what the live TUI would have
+ * repainted on.
+ *
+ * Each frame stores the (secret-redacted) session plus tasks/status/event-plan,
+ * which is enough for {@link ../replay} to reconstruct a throwaway store and
+ * render the real Ink screen back to ANSI.
+ */
+
+import type { WizardStore } from '@ui/tui/store';
+import type { ScreenName } from '@ui/tui/router';
+import type { WizardSession } from '@lib/wizard-session';
+
+/** The change(s) that triggered a frame. */
+export type FrameTrigger =
+ | 'start'
+ | 'screen'
+ | 'runPhase'
+ | 'tasks'
+ | 'status'
+ | 'eventPlan'
+ | 'overlay';
+
+export interface RecordedFrame {
+ seq: number;
+ /** ms since the recording started. */
+ ms: number;
+ /** Which key moment(s) produced this frame. */
+ triggers: FrameTrigger[];
+ screen: ScreenName;
+ hasOverlay: boolean;
+ /** Session snapshot with the access token redacted. */
+ session: WizardSession;
+ tasks: Array<{
+ label: string;
+ status: string;
+ activeForm?: string;
+ done: boolean;
+ }>;
+ statusMessages: string[];
+ eventPlan: Array<{ name: string; description: string }>;
+}
+
+export interface Recording {
+ meta: { program: string; app?: string; startedAtMs: number };
+ frames: RecordedFrame[];
+}
+
+/** Shallow-copy the session with the access token redacted (shareable). */
+function redactSession(session: WizardSession): WizardSession {
+  // Always copy: a frame must never alias the live session object, or a later
+  // in-place write (e.g. credentials landing) would leak into earlier frames.
+  if (!session.credentials) return { ...session };
+  const creds = { ...session.credentials, accessToken: 'phx_***redacted***' };
+  return { ...session, credentials: creds };
+}
+
+export class WizardRecorder {
+  private frames: RecordedFrame[] = [];
+  private seq = 0;
+  private startMs: number;
+  private unsub: (() => void) | null = null;
+
+  private prevScreen: ScreenName;
+  private prevRunPhase: WizardSession['runPhase'];
+  private prevTasks: unknown;
+  private prevStatus: unknown;
+  private prevEventPlan: unknown;
+  private prevOverlay: boolean;
+
+  constructor(
+    private readonly store: WizardStore,
+    private readonly meta: { program: string; app?: string },
+    private readonly now: () => number = () => Date.now(),
+  ) {
+    this.startMs = this.now();
+    this.prevScreen = store.currentScreen;
+    this.prevRunPhase = store.session.runPhase;
+    this.prevTasks = store.tasks;
+    this.prevStatus = store.statusMessages;
+    this.prevEventPlan = store.eventPlan;
+    this.prevOverlay = store.router.hasOverlay;
+  }
+
+  /** Begin recording: capture the initial frame, then subscribe. Idempotent. */
+  start(): void {
+    if (this.unsub) return; // already recording — never double-subscribe
+    this.capture(['start']);
+    this.unsub = this.store.subscribe(() => this.onChange());
+  }
+
+  /** Stop recording. */
+  stop(): void {
+    this.unsub?.();
+    this.unsub = null;
+  }
+
+  private onChange(): void {
+    const s = this.store;
+    const triggers: FrameTrigger[] = [];
+    // The store swaps these by reference per mutation, so `!==` spots a change.
+    if (s.currentScreen !== this.prevScreen) triggers.push('screen');
+    if (s.session.runPhase !== this.prevRunPhase) triggers.push('runPhase');
+    if (s.tasks !== this.prevTasks) triggers.push('tasks');
+    if (s.statusMessages !== this.prevStatus) triggers.push('status');
+    if (s.eventPlan !== this.prevEventPlan) triggers.push('eventPlan');
+    if (s.router.hasOverlay !== this.prevOverlay) triggers.push('overlay');
+
+    this.prevScreen = s.currentScreen;
+    this.prevRunPhase = s.session.runPhase;
+    this.prevTasks = s.tasks;
+    this.prevStatus = s.statusMessages;
+    this.prevEventPlan = s.eventPlan;
+    this.prevOverlay = s.router.hasOverlay;
+    if (triggers.length > 0) this.capture(triggers);
+  }
+
+  private capture(triggers: FrameTrigger[]): void {
+    this.frames.push({
+      seq: this.seq++,
+      ms: this.now() - this.startMs,
+      triggers,
+      screen: this.store.currentScreen,
+      hasOverlay: this.store.router.hasOverlay,
+      session: redactSession(this.store.session),
+      tasks: this.store.tasks.map((t) => ({
+        label: t.label,
+        status: t.status,
+        activeForm: t.activeForm,
+        done: t.done,
+      })),
+      statusMessages: [...this.store.statusMessages],
+      eventPlan: this.store.eventPlan.map((e) => ({
+        name: e.name,
+        description: e.description,
+      })),
+    });
+  }
+
+  /** The recording so far; `frames` is a copy, safe for the caller to keep. */
+  getRecording(): Recording {
+    return {
+      meta: { ...this.meta, startedAtMs: this.startMs },
+      frames: [...this.frames],
+    };
+  }
+
+  get frameCount(): number {
+    return this.frames.length;
+  }
+}
diff --git a/src/lib/ci-driver/replay.ts b/src/lib/ci-driver/replay.ts
new file mode 100644
index 00000000..d5729413
--- /dev/null
+++ b/src/lib/ci-driver/replay.ts
@@ -0,0 +1,74 @@
+/**
+ * Replay a {@link Recording} in the terminal: for each recorded frame,
+ * reconstruct a throwaway store from the frame's state and render the REAL Ink
+ * screen back to ANSI — so an agent or a human sees the run play back exactly as
+ * the live TUI drew it, paused at each key moment.
+ *
+ * Rendering is offline against a disposable store, so screen effects (detection,
+ * prefetch) fire harmlessly against the recorded state and never touch the real
+ * run.
+ */
+
+import { readFileSync } from 'fs';
+import type { ReactElement } from 'react';
+import { render } from 'ink-testing-library';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { TaskStatus } from '@ui/wizard-ui';
+import type { ProgramId } from '@ui/tui/router';
+import { createScreens, createServices } from '@ui/tui/screen-registry';
+import type { Recording, RecordedFrame } from './recorder.js';
+
+export function loadRecording(path: string): Recording {
+ return JSON.parse(readFileSync(path, 'utf8')) as Recording;
+}
+
+/**
+ * Render one frame's screen to an ANSI string: build a fresh WizardStore for `program`, load the
+ * frame's session/tasks/eventPlan/statusMessages into it, mount the registered screen and return its last frame.
+ * Returns a "(...)" placeholder if no screen is registered or building/rendering throws. Side effect: calls setUI().
+ */
+export function renderFrame(frame: RecordedFrame, program: ProgramId): string {
+ const store = new WizardStore(program);
+ setUI(new InkUI(store));
+ store.session = frame.session;
+ store.setTasks(
+ frame.tasks.map((t) => ({
+ label: t.label,
+ activeForm: t.activeForm,
+ status: (t.status as TaskStatus) ?? TaskStatus.Pending,
+ done: t.done,
+ })),
+ );
+ if (frame.eventPlan.length > 0) store.setEventPlan(frame.eventPlan);
+ for (const m of frame.statusMessages) store.pushStatus(m);
+
+ try {
+ const services = createServices(store);
+ const screens = createScreens(store, services);
+ const node = screens[frame.screen];
+ if (!node) return `(no component registered for screen "${frame.screen}")`;
+ const { lastFrame, unmount } = render(node as ReactElement);
+ const out = lastFrame() ?? '';
+ unmount();
+ return out;
+ } catch (err) {
+ return `(render failed: ${
+ err instanceof Error ? err.message : String(err)
+ })`;
+ }
+}
+
+/** Build the one-line banner for a frame: position, elapsed seconds, triggers, screen and task progress. */
+export function frameHeader(frame: RecordedFrame, total: number): string {
+  const elapsed = (frame.ms / 1000).toFixed(1);
+  const finished = frame.tasks.filter((t) => t.done).length;
+  // The task-progress segment is omitted entirely when the frame has no tasks.
+  const progress =
+    frame.tasks.length > 0 ? ` · tasks ${finished}/${frame.tasks.length}` : '';
+  const position = `${frame.seq + 1}/${total}`;
+  const why = frame.triggers.join('+');
+  const body = `[${position}] +${elapsed}s · ${why} · screen=${frame.screen}${progress}`;
+  return `── ${body} ──`;
+}
diff --git a/src/lib/ci-driver/wizard-ci-driver.ts b/src/lib/ci-driver/wizard-ci-driver.ts
new file mode 100644
index 00000000..cfa85a11
--- /dev/null
+++ b/src/lib/ci-driver/wizard-ci-driver.ts
@@ -0,0 +1,192 @@
+/**
+ * WizardCiDriver — the read/act control plane over a live WizardStore.
+ *
+ * This is the SDK-free core of wizard-ci-tools. A test harness or a driver LLM
+ * uses three primitives to run a real wizard end-to-end without a terminal:
+ *
+ * readState() — a truthful projection of the committed store state
+ * (the same state the Ink render is a pure function of),
+ * plus the derived currentScreen/hasOverlay so the snapshot
+ * is complete without reaching into router internals.
+ * listActions() — the commit actions legal on the current screen.
+ * performAction() — invoke one, via the exact store setter the Ink screen's
+ * keyboard handler would call, and return the next state.
+ *
+ * It observes *committed* state and actuates *commits*. In-progress keystroke
+ * state (typed-but-unsubmitted text, highlighted option, the wizard_ask
+ * per-question accumulator) is React-local and deliberately invisible here —
+ * the driver issues the final commit directly instead.
+ */
+
+import type { WizardStore } from '@ui/tui/store';
+import type { ScreenName } from '@ui/tui/router';
+import type { PendingQuestion, RunPhase } from '@lib/wizard-session';
+import { actionsForScreen, MissingParamError } from './action-registry.js';
+
+/** A setup question projected for the harness (no `detect` fn, no closures). */
+export interface SetupQuestionView {
+ key: string;
+ message: string;
+ options: Array<{ label: string; value: string; hint?: string }>;
+}
+
+/** The action surface as seen by a caller (no `apply` closure); `params` is copied from the registry entry when present. */
+export interface ActionView {
+  id: string;
+  description: string;
+  params?: Record<string, unknown>; // NOTE(review): value type assumed; narrow to match action-registry — TODO confirm
+}
+
+/**
+ * The serialized observable state. A whitelist of WizardSession — credentials
+ * are reduced to a boolean so secrets never reach a driver LLM.
+ */
+export interface CiState {
+ currentScreen: ScreenName;
+ hasOverlay: boolean;
+ runPhase: RunPhase;
+ session: {
+ installDir: string;
+ integration: string | null;
+ detectedFrameworkLabel: string | null;
+ detectionComplete: boolean;
+ setupConfirmed: boolean;
+ hasCredentials: boolean;
+ projectId: number | null;
+ mcpComplete: boolean;
+ slackStepDismissed: boolean;
+ skillsComplete: boolean;
+ outroDismissed: boolean;
+ llmOptIn: boolean;
+ discoveredFeatures: string[];
+ };
+ tasks: Array<{ label: string; status: string; activeForm?: string }>;
+ statusMessages: string[];
+ eventPlan: Array<{ name: string; description: string }>;
+ /** Present iff a wizard_ask overlay is up. */
+ pendingQuestion: PendingQuestion | null;
+ /** Unresolved framework-setup questions when on the setup screen. */
+ setupQuestions: SetupQuestionView[];
+ /** Commit actions legal on currentScreen. */
+ actions: ActionView[];
+}
+
+export class UnknownActionError extends Error {
+ constructor(action: string, screen: ScreenName) {
+ super(
+ `No action "${action}" on screen "${screen}". ` +
+ `Call list_actions / read read_state.actions first.`,
+ );
+ this.name = 'UnknownActionError';
+ }
+}
+
+export class WizardCiDriver {
+  constructor(private readonly store: WizardStore) {}
+
+  /** Snapshot the committed state plus the derived screen; credentials are reduced to `hasCredentials` + `projectId`. */
+  readState(): CiState {
+    const s = this.store.session;
+    const screen = this.store.currentScreen;
+    return {
+      currentScreen: screen,
+      hasOverlay: this.store.router.hasOverlay,
+      runPhase: s.runPhase,
+      session: {
+        installDir: s.installDir,
+        integration: s.integration,
+        detectedFrameworkLabel: s.detectedFrameworkLabel,
+        detectionComplete: s.detectionComplete,
+        setupConfirmed: s.setupConfirmed,
+        hasCredentials: s.credentials !== null,
+        projectId: s.credentials?.projectId ?? null,
+        mcpComplete: s.mcpComplete,
+        slackStepDismissed: s.slackStepDismissed,
+        skillsComplete: s.skillsComplete,
+        outroDismissed: s.outroDismissed,
+        llmOptIn: s.llmOptIn,
+        discoveredFeatures: [...s.discoveredFeatures],
+      },
+      tasks: this.store.tasks.map((t) => ({
+        label: t.label,
+        status: t.status,
+        activeForm: t.activeForm,
+      })),
+      statusMessages: [...this.store.statusMessages],
+      eventPlan: this.store.eventPlan.map((e) => ({
+        name: e.name,
+        description: e.description,
+      })),
+      pendingQuestion: s.pendingQuestion ?? null,
+      setupQuestions: this.unresolvedSetupQuestions(),
+      actions: this.listActions(),
+    };
+  }
+
+  /** Commit actions legal on the current screen, projected without their `apply` closure. */
+  listActions(): ActionView[] {
+    return actionsForScreen(this.store.currentScreen).map((a) => ({
+      id: a.id,
+      description: a.description,
+      ...(a.params ? { params: a.params } : {}),
+    }));
+  }
+
+  /**
+   * Apply a named action via its store setter, then return the next state.
+   * Throws UnknownActionError if the action isn't legal on the current screen,
+   * or MissingParamError if a required param is absent.
+   */
+  performAction(
+    actionId: string,
+    params: Record<string, unknown> = {},
+  ): CiState {
+    const screen = this.store.currentScreen;
+    const action = actionsForScreen(screen).find((a) => a.id === actionId);
+    if (!action) throw new UnknownActionError(actionId, screen);
+    action.apply(this.store, params); // may throw MissingParamError
+    return this.readState();
+  }
+
+  /**
+   * Resolve with a fresh readState() once `currentScreen` differs from its value
+   * at call time, or after timeoutMs (a timeout also resolves; it never rejects).
+   * Lets a driver loop block on the next decision point instead of polling.
+   * NOTE(review): an overlay only wakes this if opening it changes currentScreen.
+   */
+  waitForChange(timeoutMs = 120_000): Promise<CiState> {
+    const before = this.store.currentScreen;
+    return new Promise<CiState>((resolve) => {
+      let settled = false;
+      const finish = () => {
+        if (settled) return;
+        settled = true;
+        clearTimeout(timer);
+        unsub();
+        resolve(this.readState());
+      };
+      const timer = setTimeout(finish, timeoutMs);
+      const unsub = this.store.subscribe(() => {
+        if (this.store.currentScreen !== before) finish();
+      });
+    });
+  }
+
+  private unresolvedSetupQuestions(): SetupQuestionView[] {
+    const s = this.store.session;
+    const questions = s.frameworkConfig?.metadata.setup?.questions ?? [];
+    return questions
+      .filter((q) => !(q.key in s.frameworkContext))
+      .map((q) => ({
+        key: q.key,
+        message: q.message,
+        options: q.options.map((o) => ({
+          label: o.label,
+          value: o.value,
+          ...(o.hint ? { hint: o.hint } : {}),
+        })),
+      }));
+  }
+}
+
+export { MissingParamError };
diff --git a/src/lib/ci-driver/wizard-ci-tools.ts b/src/lib/ci-driver/wizard-ci-tools.ts
new file mode 100644
index 00000000..f0d6db09
--- /dev/null
+++ b/src/lib/ci-driver/wizard-ci-tools.ts
@@ -0,0 +1,108 @@
+/**
+ * wizard-ci-tools — in-process MCP server exposing the WizardCiDriver.
+ *
+ * A thin SDK adapter over {@link WizardCiDriver}: three tools that let an
+ * external driver (a test harness or an LLM) read the wizard's committed state
+ * and commit decisions, driving a real run with no terminal.
+ *
+ * read_state — truthful snapshot + derived currentScreen + legal actions
+ * list_actions — commit actions legal on the current screen
+ * perform_action — invoke one (via the store setter the Ink screen would)
+ *
+ * Mirrors wizard-tools.ts: pure adapter behind a seam (the driver), importing
+ * no product knowledge. The driver does the work; this just speaks MCP. The
+ * SDK is dynamically imported so this module loads even where the SDK is mocked.
+ */
+
+import { z } from 'zod';
+import type { WizardCiDriver } from './wizard-ci-driver.js';
+import { UnknownActionError, MissingParamError } from './wizard-ci-driver.js';
+
+let _sdkModule: unknown = null;
+async function getSDKModule(): Promise<{
+ tool: (...args: unknown[]) => unknown;
+ createSdkMcpServer: (opts: unknown) => unknown;
+}> {
+ if (!_sdkModule) {
+ _sdkModule = await import('@anthropic-ai/claude-agent-sdk');
+ }
+ return _sdkModule as never;
+}
+
+export const CI_TOOLS_SERVER_NAME = 'wizard-ci-tools';
+
+const ok = (data: unknown) => ({
+ content: [{ type: 'text' as const, text: JSON.stringify(data, null, 2) }],
+});
+const err = (message: string) => ({
+ content: [{ type: 'text' as const, text: `Error: ${message}` }],
+ isError: true,
+});
+
+/** Create the wizard-ci-tools MCP server (read_state / list_actions / perform_action) bound to a live driver; resolves to the SDK's createSdkMcpServer result. */
+export async function createWizardCiToolsServer(
+  driver: WizardCiDriver,
+): Promise<unknown> {
+  const sdk = await getSDKModule();
+  const { tool, createSdkMcpServer } = sdk;
+
+  const readState = tool(
+    'read_state',
+    "Read the wizard's current committed state: the active screen, run phase, " +
+      'a whitelisted view of the session, agent tasks/status/event-plan, any ' +
+      'pending wizard_ask question, unresolved setup questions, and the commit ' +
+      'actions legal right now. Call this first and after every perform_action.',
+    {},
+    () => ok(driver.readState()),
+  );
+
+  const listActions = tool(
+    'list_actions',
+    'List the commit actions legal on the current screen, with their params. ' +
+      'Each maps to the same store mutation the interactive UI would perform.',
+    {},
+    () =>
+      ok({
+        currentScreen: driver.readState().currentScreen,
+        actions: driver.listActions(),
+      }),
+  );
+
+  const performAction = tool(
+    'perform_action',
+    'Commit a decision by invoking a legal action for the current screen ' +
+      '(e.g. confirm_setup, choose, answer_question, set_mcp_outcome, ' +
+      'dismiss_outro, keep_skills). Returns the next state. The action must ' +
+      'appear in read_state.actions for the current screen.',
+    {
+      action: z.string().describe('Action id from read_state.actions'),
+      params: z
+        .record(z.string(), z.unknown())
+        .optional()
+        .describe('Action params, e.g. { answers: { router: "app" } }'),
+    },
+    (args: { action: string; params?: Record<string, unknown> }) => {
+      try {
+        return ok(driver.performAction(args.action, args.params ?? {}));
+      } catch (e) {
+        if (e instanceof UnknownActionError || e instanceof MissingParamError) {
+          return err(e.message);
+        }
+        return err(e instanceof Error ? e.message : String(e));
+      }
+    },
+  );
+
+  return createSdkMcpServer({
+    name: CI_TOOLS_SERVER_NAME,
+    version: '1.0.0',
+    tools: [readState, listActions, performAction],
+  });
+}
+
+/** Fully-qualified MCP tool names, for allowedTools wiring. */
+export const CI_TOOL_NAMES = {
+ readState: `mcp__${CI_TOOLS_SERVER_NAME}__read_state`,
+ listActions: `mcp__${CI_TOOLS_SERVER_NAME}__list_actions`,
+ performAction: `mcp__${CI_TOOLS_SERVER_NAME}__perform_action`,
+} as const;
diff --git a/src/lib/programs/posthog-integration/index.ts b/src/lib/programs/posthog-integration/index.ts
index bbff84be..cf7d7081 100644
--- a/src/lib/programs/posthog-integration/index.ts
+++ b/src/lib/programs/posthog-integration/index.ts
@@ -49,6 +49,19 @@ export const posthogIntegrationConfig: ProgramConfig = {
id: 'posthog-integration',
steps: POSTHOG_INTEGRATION_PROGRAM,
getContentBlocks,
+
+ // E2E test definition — the UI choices `wizard-ci --e2e` makes driving this
+ // flow headlessly: confirm intro, push past any health-check issue, pick the
+ // first setup option, skip MCP + Slack, and delete the installed skills.
+ e2e: {
+ setup: 'first',
+ healthCheck: 'dismiss',
+ mcp: 'skip',
+ slack: 'skip',
+ skills: 'delete',
+ ask: 'first',
+ },
+
// Basic integration runs without structured user input; drop wizard_ask
// so the model can't pop modal prompts mid-run. The runner forwards this
// list to the general-purpose subagent as well, so dispatched subagents
diff --git a/src/lib/programs/program-step.ts b/src/lib/programs/program-step.ts
index 55a91167..2ebd870c 100644
--- a/src/lib/programs/program-step.ts
+++ b/src/lib/programs/program-step.ts
@@ -247,6 +247,13 @@ export interface ProgramConfig {
* `ProgramCliSurface` for semantics.
*/
cli?: ProgramCliSurface;
+ /**
+ * E2E test definition: the UI choices a headless control-plane run
+ * (`wizard-ci --e2e`) makes at each decision point of THIS program's flow.
+ * Product knowledge about the flow lives here, not in the test harness.
+ * Absent → the program isn't e2e-drivable yet. See `WizardE2eProfile`.
+ */
+ e2e?: import('@lib/ci-driver/e2e-profile').WizardE2eProfile;
}
/**
From cd437a76913cecf2da0ad659901c9c00eb2633aa Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Sun, 21 Jun 2026 11:11:29 -0400
Subject: [PATCH 02/38] refactor(posthog-integration): extract e2e profile to
its own file
The e2e UI-choices object moves out of index.ts into a co-located e2e.ts
(POSTHOG_INTEGRATION_E2E_PROFILE), keeping the program config lean and the
flow's test definition in its own file.
Co-Authored-By: Claude Opus 4.8
---
src/lib/programs/posthog-integration/e2e.ts | 23 +++++++++++++++++++
src/lib/programs/posthog-integration/index.ts | 13 +++--------
2 files changed, 26 insertions(+), 10 deletions(-)
create mode 100644 src/lib/programs/posthog-integration/e2e.ts
diff --git a/src/lib/programs/posthog-integration/e2e.ts b/src/lib/programs/posthog-integration/e2e.ts
new file mode 100644
index 00000000..7affe7fc
--- /dev/null
+++ b/src/lib/programs/posthog-integration/e2e.ts
@@ -0,0 +1,23 @@
+/**
+ * E2E test definition for the PostHog integration flow — the UI choices
+ * `wizard-ci --e2e` makes when driving this program headlessly.
+ *
+ * Lives next to the program (not in the test harness) because the choices are
+ * product knowledge about this flow. The harness reads it via
+ * `ProgramConfig.e2e` and asks `decideE2eAction` what to commit on each screen.
+ */
+
+import type { WizardE2eProfile } from '@lib/ci-driver/e2e-profile';
+
+/**
+ * Happy path: confirm the intro, push past any health-check issue, pick the
+ * first setup option, skip MCP + Slack, and delete the installed skills.
+ */
+export const POSTHOG_INTEGRATION_E2E_PROFILE: WizardE2eProfile = {
+ setup: 'first',
+ healthCheck: 'dismiss',
+ mcp: 'skip',
+ slack: 'skip',
+ skills: 'delete',
+ ask: 'first',
+};
diff --git a/src/lib/programs/posthog-integration/index.ts b/src/lib/programs/posthog-integration/index.ts
index cf7d7081..a1156390 100644
--- a/src/lib/programs/posthog-integration/index.ts
+++ b/src/lib/programs/posthog-integration/index.ts
@@ -20,6 +20,7 @@ import { requestDeepLink } from '@utils/provisioning';
import { openTrackedLink, withUtm } from '@utils/links';
import type { CloudRegion } from '@utils/types';
import { POSTHOG_INTEGRATION_PROGRAM } from './steps.js';
+import { POSTHOG_INTEGRATION_E2E_PROFILE } from './e2e.js';
import { getContentBlocks } from './content/index.js';
import { buildCodingAgentPrompt } from './handoff.js';
@@ -51,16 +52,8 @@ export const posthogIntegrationConfig: ProgramConfig = {
getContentBlocks,
// E2E test definition — the UI choices `wizard-ci --e2e` makes driving this
- // flow headlessly: confirm intro, push past any health-check issue, pick the
- // first setup option, skip MCP + Slack, and delete the installed skills.
- e2e: {
- setup: 'first',
- healthCheck: 'dismiss',
- mcp: 'skip',
- slack: 'skip',
- skills: 'delete',
- ask: 'first',
- },
+ // flow headlessly. Lives in ./e2e.ts (product knowledge, not harness logic).
+ e2e: POSTHOG_INTEGRATION_E2E_PROFILE,
// Basic integration runs without structured user input; drop wizard_ask
// so the model can't pop modal prompts mid-run. The runner forwards this
From 1c2dca89f4e7944ea07160a15763120f608c337c Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Sun, 21 Jun 2026 11:12:02 -0400
Subject: [PATCH 03/38] docs(ci-driver): point the agent guide at the extracted
e2e profile file
Co-Authored-By: Claude Opus 4.8
---
src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md b/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
index a0c53b02..83396588 100644
--- a/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
+++ b/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
@@ -84,8 +84,9 @@ list_actions / perform_action), the screen→action registry, and the
**To change what the run clicks**, edit the program's **e2e profile** — the UI
choices live ON the program, not in the harness:
-`posthogIntegrationConfig.e2e` (`src/lib/programs/posthog-integration/index.ts`),
-typed by `WizardE2eProfile` (`src/lib/ci-driver/e2e-profile.ts`). The harness
+the profile in `src/lib/programs/posthog-integration/e2e.ts` (wired in via
+`ProgramConfig.e2e`), typed by `WizardE2eProfile`
+(`src/lib/ci-driver/e2e-profile.ts`). The harness
asks `decideE2eAction(state, profile)` what to commit on each screen. To make a
*different* program e2e-drivable, give it an `e2e` profile too.
From f5e6a65447b549a3f1ab5a132e406ea3c03206ca Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 12:20:08 -0400
Subject: [PATCH 04/38] test(ci-driver): add offline sample-recording generator
for replay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
scripts/record-demo.no-jest.ts — produces a recording offline (no agent, no
network) by driving the integration flow with the e2e profile + a WizardRecorder,
so `replay-e2e.no-jest.ts` can be tried without a full run.
Co-Authored-By: Claude Opus 4.8
---
scripts/record-demo.no-jest.ts | 81 ++++++++++++++++++++++++++++++++++
1 file changed, 81 insertions(+)
create mode 100644 scripts/record-demo.no-jest.ts
diff --git a/scripts/record-demo.no-jest.ts b/scripts/record-demo.no-jest.ts
new file mode 100644
index 00000000..4996a4a1
--- /dev/null
+++ b/scripts/record-demo.no-jest.ts
@@ -0,0 +1,81 @@
+/**
+ * Produce a sample recording offline (no agent, no network) so you can try the
+ * replayer. Walks the integration flow with the e2e profile, injecting the
+ * external transitions a real run gets (health probe, auth, agent runPhase) and
+ * some agent status/tasks, while a WizardRecorder captures each key moment.
+ *
+ * tsx scripts/record-demo.no-jest.ts # writes /tmp/wizard-demo.recording.json
+ * tsx scripts/replay-e2e.no-jest.ts /tmp/wizard-demo.recording.json --step
+ */
+import { writeFileSync } from 'fs';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, RunPhase } from '@lib/wizard-session';
+import { Integration } from '@lib/constants';
+import { FRAMEWORK_REGISTRY } from '@lib/registry';
+import { WizardReadiness } from '@lib/health-checks/readiness';
+import { Program } from '@lib/programs/program-registry';
+import { ScreenId } from '@ui/tui/router';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import {
+ decideE2eAction,
+ DEFAULT_E2E_PROFILE,
+} from '@lib/ci-driver/e2e-profile';
+import { WizardRecorder } from '@lib/ci-driver/recorder';
+
+const out = process.env.RECORDING_OUT ?? '/tmp/wizard-demo.recording.json';
+
+const store = new WizardStore(Program.PostHogIntegration);
+setUI(new InkUI(store));
+const session = buildSession({ installDir: '/tmp/demo-app', ci: true });
+session.integration = Integration.nextjs;
+session.frameworkConfig = FRAMEWORK_REGISTRY[Integration.nextjs];
+store.session = session;
+
+let clock = 0;
+const rec = new WizardRecorder(
+ store,
+ { program: 'posthog-integration', app: 'demo-nextjs' },
+ () => (clock += 600),
+);
+rec.start();
+
+const driver = new WizardCiDriver(store);
+const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+
+for (let i = 0; i < 40; i++) {
+ const state = driver.readState();
+ const d = decideE2eAction(state, profile);
+ if (d.action) driver.performAction(d.action.id, d.action.params ?? {});
+
+ if (state.currentScreen === ScreenId.HealthCheck) {
+ store.setReadinessResult({
+ decision: WizardReadiness.Yes,
+ health: {} as never,
+ reasons: [],
+ });
+ } else if (state.currentScreen === ScreenId.Auth) {
+ store.setCredentials({
+ accessToken: 'phx_secret',
+ projectApiKey: 'phc_demo',
+ host: 'https://us.posthog.com',
+ projectId: 1,
+ });
+ } else if (state.currentScreen === ScreenId.Run) {
+ store.pushStatus('Installing posthog-js…');
+ store.setTasks([
+ { label: 'Install SDK', status: 'completed' as never, done: true },
+ ]);
+ store.pushStatus('Wiring instrumentation-client.ts…');
+ store.setRunPhase(RunPhase.Completed);
+ }
+ if (d.done || store.session.skillsComplete) break;
+}
+rec.stop();
+writeFileSync(out, JSON.stringify(rec.getRecording(), null, 2));
+process.stdout.write(`recorded ${rec.frameCount} frames → ${out}\n`);
+process.stdout.write(
+ `replay: npx tsx scripts/replay-e2e.no-jest.ts ${out} --step\n`,
+);
From 17a87770425de99f92956f909fa7a1e05b1066d3 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 12:22:22 -0400
Subject: [PATCH 05/38] docs(scripts): add README indexing the ci-driver/e2e
scripts
scripts/README.md documents the manual control-plane + record/replay tools
(what each does, what it needs, how to run). Also commits ci-driver-live-agent.ts
(real gateway LLM drives the wizard-ci-tools MCP server) so the index is complete.
Co-Authored-By: Claude Opus 4.8
---
scripts/README.md | 50 ++++++++++
scripts/ci-driver-live-agent.ts | 170 ++++++++++++++++++++++++++++++++
2 files changed, 220 insertions(+)
create mode 100644 scripts/README.md
create mode 100644 scripts/ci-driver-live-agent.ts
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 00000000..d4d11a17
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,50 @@
+# scripts/
+
+Helper scripts. The build-related ones (`generate-version.cjs`,
+`smoke-test*.sh`, `check-screens.tsx`) are wired into `package.json`. The rest
+below are **manual, runnable tools** for the `wizard-ci-tools` control plane and
+e2e — each is a standalone `tsx` entry; most are named `*.no-jest.ts` so Jest ignores them.
+
+Run from the repo root, e.g. `npx tsx scripts/<name>.no-jest.ts`.
+
+## Control-plane e2e (drive the wizard headlessly via wizard-ci-tools)
+
+| Script | What it does | Needs |
+| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
+| **`ci-driver-demo.ts`** | Drives the real store/router/detection flow with `WizardCiDriver` — **offline, agent stubbed**. Proves the control loop on a 1-file project. | nothing |
+| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result (`E2E_RESULT_JSON`) and a recording (`E2E_RECORDING_JSON`). | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
+| **`ci-driver-live-agent.ts`** | A **real gateway LLM** drives the `wizard-ci-tools` MCP server (read_state / perform_action) to advance the wizard — agent-vs-agent proof. | `PHX_KEY_FILE` |
+
+> Normally you don't call these directly — `pnpm wizard-ci --e2e` (in
+> [wizard-workbench](https://github.com/PostHog/wizard-workbench)) orchestrates
+> `e2e-full-run` with the env hygiene + assertions.
+
+## Record & replay (view a run back in the terminal)
+
+| Script | What it does | Needs |
+| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- |
+| **`record-demo.no-jest.ts`** | Produces a sample recording **offline** (no agent, no network) by driving the flow with a `WizardRecorder`. Writes `/tmp/wizard-demo.recording.json` (override with `RECORDING_OUT`). | nothing |
+| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance, default) or `--delay <n>` (auto-play). | a `*.recording.json` |
+
+```bash
+# make a sample recording, then watch it
+npx tsx scripts/record-demo.no-jest.ts
+npx tsx scripts/replay-e2e.no-jest.ts /tmp/wizard-demo.recording.json --step
+```
+
+Real `--e2e` runs also drop a recording at
+`/tmp/wizard-e2e-<name>.recording.json`.
+
+## Background
+
+The control plane lives in [`src/lib/ci-driver/`](../src/lib/ci-driver/) —
+`WizardCiDriver` (read/act over the store), the screen→action registry, the
+`wizard-ci-tools` MCP server, the e2e profile, and the recorder/replay. See
+[`DRIVING-E2E-FROM-AN-AGENT.md`](../src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md)
+for how an agent drives these (env strip, scoped project id, gotchas).
+
+> The **security-leak repro scripts** (`relay-prod.no-jest.ts`,
+> `precedence.no-jest.ts`), which reproduce the `ANTHROPIC_BASE_URL`
+> settings-override gateway leak, live on the fix PR
+> ([PostHog/wizard#703](https://github.com/PostHog/wizard/pull/703)) and are
+> documented in its description + comments.
diff --git a/scripts/ci-driver-live-agent.ts b/scripts/ci-driver-live-agent.ts
new file mode 100644
index 00000000..94be6903
--- /dev/null
+++ b/scripts/ci-driver-live-agent.ts
@@ -0,0 +1,170 @@
+/**
+ * Live proof: a REAL gateway LLM drives the wizard-ci-tools MCP server.
+ *
+ * Configures the PostHog LLM gateway with the phx personal API key as bearer
+ * (the same "creative hack" the CI auth path uses — no OAuth, no browser),
+ * attaches the in-process wizard-ci-tools server to a real `query()`, and asks
+ * the model to read the wizard's state and advance it. Success = the model
+ * actually moved the real store off the intro screen by calling perform_action.
+ *
+ * PHX_KEY_FILE=/path/to/key.txt tsx scripts/ci-driver-live-agent.ts
+ */
+
+import fs from 'fs';
+import { query } from '@anthropic-ai/claude-agent-sdk';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession } from '@lib/wizard-session';
+import { buildAgentEnv } from '@lib/agent/agent-interface';
+import { Program } from '@lib/programs/program-registry';
+import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import {
+ createWizardCiToolsServer,
+ CI_TOOL_NAMES,
+} from '@lib/ci-driver/wizard-ci-tools';
+
+const GATEWAY_URL = 'https://gateway.us.posthog.com/wizard';
+const MODEL = 'claude-haiku-4-5-20251001';
+
+async function main() {
+  const keyFile = process.env.PHX_KEY_FILE;
+  if (!keyFile) throw new Error('Set PHX_KEY_FILE to the phx key path');
+  const phxKey = fs.readFileSync(keyFile, 'utf8').trim();
+
+  // Point the agent SDK at the PostHog gateway, phx key as bearer.
+  process.env.ANTHROPIC_BASE_URL = GATEWAY_URL;
+  process.env.ANTHROPIC_AUTH_TOKEN = phxKey;
+  process.env.CLAUDE_CODE_OAUTH_TOKEN = phxKey;
+  process.env.CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS = 'true';
+
+  const store = new WizardStore(Program.PostHogIntegration);
+  setUI(new InkUI(store));
+  store.session = buildSession({ installDir: '/tmp/ci-live', ci: true });
+  const driver = new WizardCiDriver(store);
+  const server = await createWizardCiToolsServer(driver);
+
+  process.stdout.write(
+    `\nBefore: currentScreen=${
+      driver.readState().currentScreen
+    } setupConfirmed=${store.session.setupConfirmed}\n\n`,
+  );
+
+  const prompt =
+    'You are driving a PostHog wizard through its test control plane. ' +
+    'The wizard is on its intro screen. Your very first action must be to call ' +
+    'the perform_action tool with {"action":"confirm_setup"} (no other params) ' +
+    'to advance past it. Do that immediately, before anything else. ' +
+    'Then call read_state once and report the new currentScreen. Be terse.';
+
+  const abort = new AbortController();
+  const timer = setTimeout(() => abort.abort(), 220_000);
+
+  const toolCalls: string[] = [];
+  let finalText = '';
+
+  // Streaming-input prompt. A plain string prompt closes stdin after turn 1,
+  // which breaks every follow-up turn (the wizard hits the same SDK bug and
+  // works around it the same way). Keep the generator open until the SDK
+  // emits its `result` message so the session survives multi-turn tool use.
+  let signalDone!: () => void;
+  const resultReceived = new Promise<void>((r) => {
+    signalDone = r;
+  });
+  const promptStream = async function* () {
+    yield {
+      type: 'user' as const,
+      session_id: '',
+      message: { role: 'user' as const, content: prompt },
+      parent_tool_use_id: null,
+    };
+    await resultReceived;
+  };
+
+  try {
+    const response = query({
+      prompt: promptStream(),
+      options: {
+        abortController: abort,
+        model: MODEL,
+        permissionMode: 'bypassPermissions',
+        betas: ['context-1m-2025-08-07'],
+        systemPrompt: { type: 'preset', preset: 'claude_code' },
+        tools: { type: 'preset', preset: 'claude_code' },
+        env: {
+          ...process.env,
+          // The user's Anthropic key (set in this shell) would override the
+          // gateway bearer and 401 — unset it so ANTHROPIC_AUTH_TOKEN wins.
+          ANTHROPIC_API_KEY: undefined,
+          ANTHROPIC_BASE_URL: GATEWAY_URL,
+          ANTHROPIC_AUTH_TOKEN: phxKey,
+          CLAUDE_CODE_OAUTH_TOKEN: phxKey,
+          ENABLE_TOOL_SEARCH: 'auto:0',
+          MCP_CONNECTION_NONBLOCKING: '0',
+          // The gateway expects PostHog's custom headers (bedrock fallback +
+          // metadata) — the wizard sets these for every real run.
+          ANTHROPIC_CUSTOM_HEADERS: buildAgentEnv({}, {}),
+        },
+        mcpServers: { [`wizard-ci-tools`]: server },
+        allowedTools: [
+          CI_TOOL_NAMES.readState,
+          CI_TOOL_NAMES.listActions,
+          CI_TOOL_NAMES.performAction,
+        ],
+      },
+    } as never);
+
+    for await (const msg of response as AsyncIterable<any>) { // eslint-disable-line @typescript-eslint/no-explicit-any -- SDK messages are read untyped here
+      if (msg.type === 'assistant') {
+        for (const block of msg.message?.content ?? []) {
+          if (block.type === 'tool_use') {
+            toolCalls.push(block.name);
+            process.stdout.write(` → tool_use: ${block.name}\n`);
+          } else if (block.type === 'text' && block.text) {
+            finalText = block.text;
+          }
+        }
+      } else if (msg.type === 'result') {
+        if (msg.result) finalText = msg.result;
+        signalDone(); // close the prompt stream so the SDK can exit
+      }
+      // Stop as soon as the model has driven the store off the intro screen —
+      // one successful tool-driven commit is the proof we're after.
+      if (store.session.setupConfirmed) {
+        abort.abort();
+        break;
+      }
+    }
+  } catch (e) {
+    // A later-turn gateway error must not mask a commit that already landed —
+    // we evaluate store state below regardless.
+    process.stdout.write(
+      ` (query ended: ${e instanceof Error ? e.message.split('\n')[0] : e})\n`,
+    );
+  } finally {
+    signalDone();
+    clearTimeout(timer);
+  }
+
+  const after = driver.readState();
+  process.stdout.write(
+    `\nAfter: currentScreen=${after.currentScreen} setupConfirmed=${store.session.setupConfirmed}\n`,
+  );
+  process.stdout.write(`Model said: ${finalText.slice(0, 200)}\n`);
+  process.stdout.write(`Tool calls: ${toolCalls.join(', ') || '(none)'}\n\n`);
+
+  const advanced = store.session.setupConfirmed === true;
+  process.stdout.write(
+    `${
+      advanced
+        ? '✓ LLM advanced the real store via wizard-ci-tools'
+        : '✗ store did not advance'
+    }\n\n`,
+  );
+  process.exit(advanced ? 0 : 1);
+}
+
+main().catch((e) => {
+ process.stderr.write(`\nLIVE_FAIL: ${e?.stack ?? e}\n`);
+ process.exit(1);
+});
From c8eacca64f0d7186531a3a7a3bfb92d7a997b39e Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 13:51:03 -0400
Subject: [PATCH 06/38] fix(ci-driver): classify warehouse-intro +
self-driving-intro screens
main added two confirm-and-continue intro screens (WarehouseIntro,
SelfDrivingIntro, both call store.completeSetup()). The action-registry
exhaustiveness test flagged them as uncovered. Register both as confirm_setup
in ACTION_REGISTRY and in the e2e walk policy.
Co-Authored-By: Claude Opus 4.8
---
src/lib/ci-driver/action-registry.ts | 2 ++
src/lib/ci-driver/e2e-profile.ts | 2 ++
2 files changed, 4 insertions(+)
diff --git a/src/lib/ci-driver/action-registry.ts b/src/lib/ci-driver/action-registry.ts
index 57e9f080..cde6315a 100644
--- a/src/lib/ci-driver/action-registry.ts
+++ b/src/lib/ci-driver/action-registry.ts
@@ -93,6 +93,8 @@ export const ACTION_REGISTRY: Partial> = {
[ScreenId.AgentSkillIntro]: [confirmSetupAction],
[ScreenId.AuditIntro]: [confirmSetupAction],
[ScreenId.DoctorIntro]: [confirmSetupAction],
+ [ScreenId.WarehouseIntro]: [confirmSetupAction],
+ [ScreenId.SelfDrivingIntro]: [confirmSetupAction],
// ── Health check — dismiss a blocking outage ──────────────────────────
[ScreenId.HealthCheck]: [
diff --git a/src/lib/ci-driver/e2e-profile.ts b/src/lib/ci-driver/e2e-profile.ts
index 9ed909ee..1551c65a 100644
--- a/src/lib/ci-driver/e2e-profile.ts
+++ b/src/lib/ci-driver/e2e-profile.ts
@@ -73,6 +73,8 @@ export function decideE2eAction(
case ScreenId.AuditIntro:
case ScreenId.SourceMapsIntro:
case ScreenId.DoctorIntro:
+ case ScreenId.WarehouseIntro:
+ case ScreenId.SelfDrivingIntro:
return { action: { id: 'confirm_setup' } };
case ScreenId.HealthCheck:
From b1a43ee295ffdcb4b6445b0403846ac5744b8ae5 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 13:55:44 -0400
Subject: [PATCH 07/38] docs(ci-driver): rename agent guide to ARCHITECTURE.md,
strip internal refs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Move DRIVING-E2E-FROM-AN-AGENT.md → ARCHITECTURE.md to match the co-located
subsystem-doc convention (cf. programs/self-driving/ARCHITECTURE.md). Remove
content that shouldn't ship in the public repo: the internal test project id +
team name, the workbench test-api-key.txt secret file, and pointers to
workbench-only scratch files. Keep the architecture, profiles, record/replay, and
MCP-loop guidance; generalize the run instructions. Update the scripts/README link.
Co-Authored-By: Claude Opus 4.8
---
scripts/README.md | 4 +-
src/lib/ci-driver/ARCHITECTURE.md | 115 +++++++++++++++
.../ci-driver/DRIVING-E2E-FROM-AN-AGENT.md | 131 ------------------
3 files changed, 117 insertions(+), 133 deletions(-)
create mode 100644 src/lib/ci-driver/ARCHITECTURE.md
delete mode 100644 src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
diff --git a/scripts/README.md b/scripts/README.md
index d4d11a17..38dc8a7d 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -40,8 +40,8 @@ Real `--e2e` runs also drop a recording at
The control plane lives in [`src/lib/ci-driver/`](../src/lib/ci-driver/) —
`WizardCiDriver` (read/act over the store), the screen→action registry, the
`wizard-ci-tools` MCP server, the e2e profile, and the recorder/replay. See
-[`DRIVING-E2E-FROM-AN-AGENT.md`](../src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md)
-for how an agent drives these (env strip, scoped project id, gotchas).
+[`ARCHITECTURE.md`](../src/lib/ci-driver/ARCHITECTURE.md) for how an agent
+drives these (env strip, scoped project id, gotchas).
> **Security-leak repro scripts** (`relay-prod.no-jest.ts`,
> `precedence.no-jest.ts`) that reproduce the `ANTHROPIC_BASE_URL`
diff --git a/src/lib/ci-driver/ARCHITECTURE.md b/src/lib/ci-driver/ARCHITECTURE.md
new file mode 100644
index 00000000..a3b3be3a
--- /dev/null
+++ b/src/lib/ci-driver/ARCHITECTURE.md
@@ -0,0 +1,115 @@
+# ci-driver — Headless e2e Control Plane
+
+How an agent (or a script) drives a **real** wizard run end-to-end with no
+terminal and no browser, and asserts it worked. This is the control-plane path:
+it runs the WHOLE interactive flow headlessly via `wizard-ci-tools` and asserts
+on structured state — not the classic `--ci` mode (LoggingUI, stdout-grep,
+agent-only).
+
+## The pieces
+
+```
+src/lib/ci-driver/
+ wizard-ci-driver.ts WizardCiDriver — read_state / list_actions / perform_action
+ action-registry.ts screen → the actions legal on it (+ NO_ACTION_SCREENS)
+ wizard-ci-tools.ts in-process MCP server exposing the driver to an external loop
+ e2e-profile.ts WizardE2eProfile + decideE2eAction — the scripted walk policy
+ recorder.ts captures a run as key-moment frames
+ replay.ts reconstructs a frame's store and renders the real Ink screen
+```
+
+The driver reads and mutates the **real** `WizardStore`: the router resolves the
+active screen from session state, every action goes through a store setter, and
+the render is a pure projection of that state. So driving the store headlessly
+exercises exactly the code an interactive run would.
+
+## Driving a run
+
+A headless run wires a real `WizardStore` + `InkUI` (never rendered), a
+concurrent `WizardCiDriver`, and the real `runAgent` against the gateway. The
+loop is:
+
+```
+read_state → decideE2eAction(state, profile) → perform_action → repeat
+```
+
+`scripts/e2e-full-run.no-jest.ts` is the runnable harness; the
+[wizard-workbench](https://github.com/PostHog/wizard-workbench)
+`wizard-ci --e2e` command orchestrates it (copies the app to a scratch dir,
+strips the host env, asserts on the result). Run shape:
+
+```bash
+POSTHOG_PERSONAL_API_KEY=… POSTHOG_REGION=us \
+ npx tsx scripts/e2e-full-run.no-jest.ts # APP_DIR, PROJECT_ID via env
+```
+
+### Four things that bite
+
+1. **Running inside an agent session.** Host env (`CLAUDECODE`, `ANTHROPIC_*`,
+ `CLAUDE_CODE_*`) makes the wizard's spawned agent defer auth to the host →
+ `apiKeySource: none` → 401. The harness strips these for the child; if you
+ invoke it directly, strip them yourself. A plain CI shell never has them.
+2. **A project-scoped key needs its project id.** A personal key scoped to one
+ team must be given that team's `--project-id` (or
+ `POSTHOG_WIZARD_PROJECT_ID`), or bootstrap 403s on project-data fetch. The
+ key still authenticates — it just isn't scoped to the default team.
+3. **Never run on a real fixture.** Always a throwaway copy; the harness does
+ this.
+4. **Runs are sequential and minutes long** (~3–8 min, gateway round-trips
+ dominate). The agent log is one shared file — never run two at once.
+
+## Reading the result
+
+The harness emits a JSON result; assert on:
+
+| field | pass when |
+| --------------------------- | ------------------------------------------------ |
+| `runPhase` | `"completed"` (the agent finished) |
+| `hasPosthogDep` / `envFile` | a posthog dep was added and/or a `.env*` written |
+| `screenPath` | includes `keep-skills` (full flow walked) |
+| `skillsComplete` | `true` (run reached its done-signal) |
+| `skillsDeleted` | `true` when policy = delete |
+
+## Changing what the run does
+
+The UI choices live **on the program**, not in the harness — product knowledge
+stays out of infrastructure. Edit the program's e2e profile
+(`src/lib/programs/posthog-integration/e2e.ts`, wired via `ProgramConfig.e2e`,
+typed by `WizardE2eProfile`). The harness asks `decideE2eAction(state, profile)`
+what to commit on each screen. To make another program e2e-drivable, give it an
+`e2e` profile too.
+
+The flow is **snapshot-tested** offline (no agent, deterministic):
+`__tests__/e2e-flow-snapshot.test.ts` golden-checks the (screen → decision)
+trace. Update with `jest -u` after an intentional flow/profile change. This is
+the structured-state analog of the TUI ANSI screenshots in
+`scripts/__screenshots__/`.
+
+## Record & replay
+
+Every run is recorded as a timeline of **key-moment frames** — one each time the
+store/router changes (a route, a task-list update, a status line, a runPhase
+change, an overlay). Replay reconstructs each frame's store and renders the real
+Ink screen back to the terminal, so a run can be watched back to verify it:
+
+```bash
+npx tsx scripts/record-demo.no-jest.ts # sample, offline
+npx tsx scripts/replay-e2e.no-jest.ts --step # Enter ▸ step
+npx tsx scripts/replay-e2e.no-jest.ts --delay 1200 # auto-play
+```
+
+An agent that can't sit in the stepper can instead read the recording JSON
+directly (each frame has `triggers`, `screen`, `tasks`, `statusMessages`,
+redacted `session`) or render specific frames to ANSI with `renderFrame()` from
+`replay.ts`. The access token is redacted, so recordings are safe to share.
+Code: `recorder.ts` (capture) + `replay.ts` (render).
+
+## Driving it as a true LLM loop (optional)
+
+`wizard-ci-tools.ts` exposes `read_state` / `list_actions` / `perform_action` as
+an in-process MCP server. To have a model (not a scripted profile) play the
+user, connect a driver model and loop `read_state → reason → perform_action`.
+For deterministic CI prefer the scripted profile; reserve the LLM loop for
+fuzzing the flow. Note: a multi-turn driver must route through the wizard's real
+agent initialization for gateway auth — a bare `query()` 401s on the follow-up
+turn.
diff --git a/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md b/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
deleted file mode 100644
index 83396588..00000000
--- a/src/lib/ci-driver/DRIVING-E2E-FROM-AN-AGENT.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# Driving wizard e2e runs from an agent
-
-For a future AI agent asked to run a **real** wizard integration end-to-end and
-check it worked. This is the control-plane path (`wizard-ci --e2e`): it runs the
-WHOLE interactive flow headlessly via `wizard-ci-tools` and asserts on structured
-state — not the classic `--ci` (LoggingUI, stdout-grep, agent-only).
-
-It complements the human runbook `workbench/ci-verify-plan.md` (read that too —
-it has the key/region/build-channel facts). This doc is the agent-specific how-to.
-
-## The one command
-
-```bash
-cd /wizard-workbench
-WIZARD_PATH=/wizard \
-POSTHOG_PERSONAL_API_KEY="$(cat /test-api-key.txt)" \
-POSTHOG_REGION=us \
- npx tsx services/wizard-ci/index.ts \
- basic-integration/javascript-node/express-todo --e2e --project-id 228144
-```
-
-Pass `--keep-skills` to keep the installed skills (default deletes them). Swap the
-app path for any `apps/<...>` dir (e.g. `basic-integration/next-js/15-app-router-todo`).
-
-It copies the app to `/tmp`, runs the real agent against prod cloud, drives every
-screen, and prints `✓ E2E PASS` / `✗ E2E FAIL` + a `/tmp/wizard-e2e-.json`
-result. Exit 0 = pass. A run takes **~3-8 min** (gateway round-trips dominate).
-
-## The four things that bite an agent (and why)
-
-1. **You are running INSIDE a Claude Code session.** Its env
- (`CLAUDECODE`, `CLAUDE_CODE_SDK_HAS_*_REFRESH`, `ANTHROPIC_*`, …) makes the
- wizard's spawned agent defer auth to the host → `apiKeySource: none` → **401
- auth-error**. The wizard-ci `--e2e` path strips these for the child, so the
- one command above is safe. If you ever invoke the harness directly, strip them
- yourself (see `STRIP_ENV` in `services/wizard-ci/e2e.ts`). A plain CI shell
- doesn't have these, so it never hits this.
-
-2. **The test key is project-scoped.** `test-api-key.txt` only reads project
- **228144** ("cookiesssss", US). Without `--project-id 228144` (or
- `POSTHOG_WIZARD_PROJECT_ID`), bootstrap 403s ("Access denied while trying to
- fetch project data"). The key is still valid — it authenticates and works as
- the LLM gateway bearer; it just isn't scoped to the default team.
-
-3. **Never run on the real fixture.** Always a `/tmp` copy (the harness does
- this). The runbook: after any accidental run on a real app, `git checkout` it.
-
-4. **Runs are sequential, and minutes long.** The agent log is a single shared
- file (`/tmp/posthog-wizard.log`) — never run two at once. Launch with
- `run_in_background: true` and watch with a Monitor on the output file; don't
- block. Watch for: `screen →`, `assertions`, `E2E PASS/FAIL`, and `auth-error`.
-
-## How to read the result
-
-`/tmp/wizard-e2e-.json` (and the stdout assertions):
-
-| field | pass when |
-|---|---|
-| `runPhase` | `"completed"` (the agent finished) |
-| `hasPosthogDep` / `envFile` | a posthog dep was added and/or a `.env*` written |
-| `screenPath` | includes `keep-skills` (full flow walked) |
-| `skillsComplete` | `true` (run reached its done-signal) |
-| `skillsDeleted` | `true` when policy = delete |
-
-Also eyeball the `/tmp/` copy: `package.json` has `posthog-*`, an `.env*`
-has `POSTHOG_*`, and framework-specific files exist (e.g. Next.js
-`instrumentation-client.ts` with `posthog.init(...)`).
-
-## How it's built (so you can change it)
-
-```
-wizard-ci --e2e (workbench/services/wizard-ci/{index,e2e}.ts)
- → spawns the wizard repo's headless harness (env-stripped, /tmp copy):
- wizard/scripts/e2e-full-run.no-jest.ts
- · real WizardStore + InkUI (never rendered) — no terminal, no browser
- · real runAgent → prod gateway (phx key as bearer, --project-id)
- · a concurrent WizardCiDriver drives each screen
- → reads E2E_RESULT_JSON and asserts
-```
-
-The driver is `wizard/src/lib/ci-driver/` — `WizardCiDriver` (read_state /
-list_actions / perform_action), the screen→action registry, and the
-`wizard-ci-tools` MCP server.
-
-**To change what the run clicks**, edit the program's **e2e profile** — the UI
-choices live ON the program, not in the harness:
-the profile in `src/lib/programs/posthog-integration/e2e.ts` (wired in via
-`ProgramConfig.e2e`), typed by `WizardE2eProfile`
-(`src/lib/ci-driver/e2e-profile.ts`). The harness
-asks `decideE2eAction(state, profile)` what to commit on each screen. To make a
-*different* program e2e-drivable, give it an `e2e` profile too.
-
-**The flow is snapshot-tested** offline (no agent, deterministic):
-`src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts` golden-checks the
-(screen → decision) trace. If you change the flow or a profile, update with
-`jest -u`. This is the structured-state analog of the TUI ANSI screenshots in
-`scripts/__screenshots__/`.
-
-## Record & replay (verify a run after the fact)
-
-Every `--e2e` run is **recorded** as a timeline of key-moment frames — one each
-time the store/router changes (a route change, a task-list update, a new status
-line, a runPhase change, an overlay). The recording lands at
-`/tmp/wizard-e2e-.recording.json` and the run prints the replay command.
-
-Replay reconstructs each frame's store and renders the **real Ink screen** back
-to the terminal, so you (agent or human) can watch the run play back to verify it:
-
-```bash
-pnpm wizard-ci --replay /tmp/wizard-e2e-.recording.json # Enter ▸ step
-pnpm wizard-ci --replay /tmp/wizard-e2e-.recording.json --delay 1200 # auto
-```
-
-As an agent you can't sit in the interactive stepper, but you can: (a) read the
-recording JSON directly (each frame has `triggers`, `screen`, `tasks`,
-`statusMessages`, redacted `session`) to assert the run hit the right moments, or
-(b) render specific frames to ANSI offline with `renderFrame()` from
-`src/lib/ci-driver/replay.ts`. The access token is redacted, so recordings are
-safe to share. Code: `recorder.ts` (capture) + `replay.ts` (render).
-
-## Driving it as a true LLM loop (optional)
-
-The `wizard-ci-tools` MCP server exposes `read_state` / `list_actions` /
-`perform_action` to an external driver. To have an LLM (not a scripted profile)
-play the user, connect a driver model to that server and loop
-`read_state → reason → perform_action`. Proven working: a gateway model called
-`perform_action {action:"confirm_setup"}` and advanced the real store. For
-deterministic CI, prefer the scripted profile above; reserve the LLM loop for
-fuzzing the flow. Auth caveat: a bare `query()` 401s on the follow-up turn
-through the `/wizard` gateway — route through the wizard's real `initializeAgent`
-for multi-turn (see `wizard-ci-tools-research.md`).
From 61527e80a30e689284fd145f2e20f5ab0bdb2de5 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 15:52:48 -0400
Subject: [PATCH 08/38] feat(ci-driver): render a recording to per-frame TUI
snapshots
scripts/render-snapshots.no-jest.ts renders every key-moment frame of a recording
to a real-Ink ANSI snapshot (one <seq>-<screen>.ans per frame), via replay's
renderFrame under tsx. These feed the workbench visual-regression flow.
Co-Authored-By: Claude Opus 4.8
---
scripts/render-snapshots.no-jest.ts | 38 +++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
create mode 100644 scripts/render-snapshots.no-jest.ts
diff --git a/scripts/render-snapshots.no-jest.ts b/scripts/render-snapshots.no-jest.ts
new file mode 100644
index 00000000..401c036b
--- /dev/null
+++ b/scripts/render-snapshots.no-jest.ts
@@ -0,0 +1,38 @@
+/**
+ * Render a recording to per-frame TUI snapshots — one `.ans` file per key
+ * moment, the REAL Ink screen rendered to ANSI (via replay's renderFrame, which
+ * needs real ink, hence tsx not jest).
+ *
+ * These are the snapshots the workbench's visual-comparison flow diffs against a
+ * committed baseline. A recording comes from a real `--e2e` run, so the
+ * snapshots are what the user actually saw; run-to-run differences (e.g. the
+ * agent enqueuing a different task) show up in the side-by-side for a human to
+ * review.
+ *
+ *   tsx scripts/render-snapshots.no-jest.ts <recording.json> <outDir>
+ */
+import { mkdirSync, writeFileSync, rmSync } from 'fs';
+import { join } from 'path';
+import type { ProgramId } from '@ui/tui/router';
+import { loadRecording, renderFrame } from '@lib/ci-driver/replay';
+
+const [recordingPath, outDir] = process.argv.slice(2);
+if (!recordingPath || !outDir) {
+  process.stderr.write('usage: render-snapshots <recording.json> <outDir>\n');
+  process.exit(2); // usage error
+}
+
+const rec = loadRecording(recordingPath);
+const program = rec.meta.program as ProgramId;
+
+rmSync(outDir, { recursive: true, force: true }); // wipe stale snapshots
+mkdirSync(outDir, { recursive: true });
+
+for (const frame of rec.frames) {
+  const seq = String(frame.seq).padStart(2, '0');
+  // One file per key moment: <seq>-<screen>.ans
+  const name = `${seq}-${frame.screen}.ans`;
+  writeFileSync(join(outDir, name), renderFrame(frame, program));
+}
+
+process.stdout.write(`rendered ${rec.frames.length} snapshots → ${outDir}\n`);
From a55ccbd3c15bef68c2a8c2fc0671349e5d075fce Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:17:26 -0400
Subject: [PATCH 09/38] refactor: move e2e/recording harness out of prod src
into e2e-harness/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
None of the control-plane / recording / e2e machinery belongs in the wizard's
production source. Relocate src/lib/ci-driver/ → e2e-harness/ at the repo root
(next to e2e-tests/), and sever every prod coupling:
- Remove the ProgramConfig.e2e field (program-step.ts) and the on-program profile
(delete posthog-integration/e2e.ts, unwire index.ts). Per-program profiles now
live in the harness — e2e-harness/profiles.ts, profileFor(programId).
- Add an @e2e-harness/* path alias (tsconfig.build.json + jest moduleNameMapper);
repoint scripts/tests off @lib/ci-driver.
Result: src/ has ZERO references to the harness, and the published tsdown bundle
contains none of it (previously the ~90-byte profile object shipped). Full suite
(1045 tests, 3 snapshots) passes; real-recording render verified under tsx.
Co-Authored-By: Claude Opus 4.8
---
.../ci-driver => e2e-harness}/ARCHITECTURE.md | 19 ++++++----
.../e2e-flow-snapshot.test.ts.snap | 0
.../__tests__/e2e-flow-snapshot.test.ts | 8 ++---
.../__tests__/recorder.test.ts | 6 ++--
.../__tests__/wizard-ci-driver.test.ts | 0
.../action-registry.ts | 0
.../ci-driver => e2e-harness}/e2e-profile.ts | 11 +++---
e2e-harness/profiles.ts | 36 +++++++++++++++++++
.../lib/ci-driver => e2e-harness}/recorder.ts | 0
{src/lib/ci-driver => e2e-harness}/replay.ts | 0
.../wizard-ci-driver.ts | 0
.../wizard-ci-tools.ts | 0
package.json | 1 +
scripts/README.md | 10 +++---
scripts/ci-driver-demo.ts | 2 +-
scripts/ci-driver-live-agent.ts | 4 +--
scripts/e2e-full-run.no-jest.ts | 17 +++++----
scripts/record-demo.no-jest.ts | 13 +++----
scripts/render-snapshots.no-jest.ts | 2 +-
scripts/replay-e2e.no-jest.ts | 2 +-
src/lib/programs/posthog-integration/e2e.ts | 23 ------------
src/lib/programs/posthog-integration/index.ts | 5 ---
src/lib/programs/program-step.ts | 7 ----
tsconfig.build.json | 1 +
tsconfig.json | 5 ++-
25 files changed, 88 insertions(+), 84 deletions(-)
rename {src/lib/ci-driver => e2e-harness}/ARCHITECTURE.md (87%)
rename {src/lib/ci-driver => e2e-harness}/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap (100%)
rename {src/lib/ci-driver => e2e-harness}/__tests__/e2e-flow-snapshot.test.ts (92%)
rename {src/lib/ci-driver => e2e-harness}/__tests__/recorder.test.ts (94%)
rename {src/lib/ci-driver => e2e-harness}/__tests__/wizard-ci-driver.test.ts (100%)
rename {src/lib/ci-driver => e2e-harness}/action-registry.ts (100%)
rename {src/lib/ci-driver => e2e-harness}/e2e-profile.ts (91%)
create mode 100644 e2e-harness/profiles.ts
rename {src/lib/ci-driver => e2e-harness}/recorder.ts (100%)
rename {src/lib/ci-driver => e2e-harness}/replay.ts (100%)
rename {src/lib/ci-driver => e2e-harness}/wizard-ci-driver.ts (100%)
rename {src/lib/ci-driver => e2e-harness}/wizard-ci-tools.ts (100%)
delete mode 100644 src/lib/programs/posthog-integration/e2e.ts
diff --git a/src/lib/ci-driver/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
similarity index 87%
rename from src/lib/ci-driver/ARCHITECTURE.md
rename to e2e-harness/ARCHITECTURE.md
index a3b3be3a..bdf80197 100644
--- a/src/lib/ci-driver/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -8,12 +8,17 @@ agent-only).
## The pieces
+This whole harness lives in `e2e-harness/` at the repo root — deliberately OUT
+of `src/` so none of it is part of the wizard's production source (nothing in
+`src/` imports it; the tsdown bundle never includes it).
+
```
-src/lib/ci-driver/
+e2e-harness/
wizard-ci-driver.ts WizardCiDriver — read_state / list_actions / perform_action
action-registry.ts screen → the actions legal on it (+ NO_ACTION_SCREENS)
wizard-ci-tools.ts in-process MCP server exposing the driver to an external loop
e2e-profile.ts WizardE2eProfile + decideE2eAction — the scripted walk policy
+ profiles.ts per-program profiles + profileFor(programId)
recorder.ts captures a run as key-moment frames
replay.ts reconstructs a frame's store and renders the real Ink screen
```
@@ -72,12 +77,12 @@ The harness emits a JSON result; assert on:
## Changing what the run does
-The UI choices live **on the program**, not in the harness — product knowledge
-stays out of infrastructure. Edit the program's e2e profile
-(`src/lib/programs/posthog-integration/e2e.ts`, wired via `ProgramConfig.e2e`,
-typed by `WizardE2eProfile`). The harness asks `decideE2eAction(state, profile)`
-what to commit on each screen. To make another program e2e-drivable, give it an
-`e2e` profile too.
+The per-program UI choices are product knowledge, but they live in the harness
+(`profiles.ts`, keyed by program id) — not on the program config — so this
+machinery stays out of the wizard's production source. Edit the program's entry
+in `profiles.ts` (typed by `WizardE2eProfile`). The harness asks
+`decideE2eAction(state, profile)` what to commit on each screen. To make another
+program e2e-drivable, add its profile to `profiles.ts`.
The flow is **snapshot-tested** offline (no agent, deterministic):
`__tests__/e2e-flow-snapshot.test.ts` golden-checks the (screen → decision)
diff --git a/src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap b/e2e-harness/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
similarity index 100%
rename from src/lib/ci-driver/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
rename to e2e-harness/__tests__/__snapshots__/e2e-flow-snapshot.test.ts.snap
diff --git a/src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts b/e2e-harness/__tests__/e2e-flow-snapshot.test.ts
similarity index 92%
rename from src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts
rename to e2e-harness/__tests__/e2e-flow-snapshot.test.ts
index b749ac03..ac549020 100644
--- a/src/lib/ci-driver/__tests__/e2e-flow-snapshot.test.ts
+++ b/e2e-harness/__tests__/e2e-flow-snapshot.test.ts
@@ -22,9 +22,9 @@ import { FRAMEWORK_REGISTRY } from '@lib/registry';
import { WizardReadiness } from '@lib/health-checks/readiness';
import { Program } from '@lib/programs/program-registry';
import { ScreenId } from '@ui/tui/router';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
import { WizardCiDriver } from '../wizard-ci-driver';
-import { decideE2eAction, DEFAULT_E2E_PROFILE } from '../e2e-profile';
+import { decideE2eAction } from '../e2e-profile';
+import { profileFor } from '../profiles';
/**
* Walk the program flow offline using its e2e profile, injecting the external
@@ -42,7 +42,7 @@ function traceFlow(
store.session = session;
const driver = new WizardCiDriver(store);
- const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+ const profile = profileFor(Program.PostHogIntegration);
const trace: Array<{ screen: string; action: string }> = [];
for (let guard = 0; guard < 40; guard++) {
@@ -82,7 +82,7 @@ describe('e2e flow snapshot — posthog-integration', () => {
it('Next.js (with a setup question) walks a stable path', () => {
expect({
program: 'posthog-integration',
- profile: posthogIntegrationConfig.e2e,
+ profile: profileFor(Program.PostHogIntegration),
trace: traceFlow(Integration.nextjs),
}).toMatchSnapshot();
});
diff --git a/src/lib/ci-driver/__tests__/recorder.test.ts b/e2e-harness/__tests__/recorder.test.ts
similarity index 94%
rename from src/lib/ci-driver/__tests__/recorder.test.ts
rename to e2e-harness/__tests__/recorder.test.ts
index ccc20cb4..747f7d60 100644
--- a/src/lib/ci-driver/__tests__/recorder.test.ts
+++ b/e2e-harness/__tests__/recorder.test.ts
@@ -12,9 +12,9 @@ import { FRAMEWORK_REGISTRY } from '@lib/registry';
import { WizardReadiness } from '@lib/health-checks/readiness';
import { Program } from '@lib/programs/program-registry';
import { ScreenId } from '@ui/tui/router';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
import { WizardCiDriver } from '../wizard-ci-driver';
-import { decideE2eAction, DEFAULT_E2E_PROFILE } from '../e2e-profile';
+import { decideE2eAction } from '../e2e-profile';
+import { profileFor } from '../profiles';
import { WizardRecorder } from '../recorder';
function recordedRun() {
@@ -34,7 +34,7 @@ function recordedRun() {
rec.start();
const driver = new WizardCiDriver(store);
- const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+ const profile = profileFor(Program.PostHogIntegration);
for (let i = 0; i < 40; i++) {
const state = driver.readState();
const d = decideE2eAction(state, profile);
diff --git a/src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts b/e2e-harness/__tests__/wizard-ci-driver.test.ts
similarity index 100%
rename from src/lib/ci-driver/__tests__/wizard-ci-driver.test.ts
rename to e2e-harness/__tests__/wizard-ci-driver.test.ts
diff --git a/src/lib/ci-driver/action-registry.ts b/e2e-harness/action-registry.ts
similarity index 100%
rename from src/lib/ci-driver/action-registry.ts
rename to e2e-harness/action-registry.ts
diff --git a/src/lib/ci-driver/e2e-profile.ts b/e2e-harness/e2e-profile.ts
similarity index 91%
rename from src/lib/ci-driver/e2e-profile.ts
rename to e2e-harness/e2e-profile.ts
index 1551c65a..f1f3a7aa 100644
--- a/src/lib/ci-driver/e2e-profile.ts
+++ b/e2e-harness/e2e-profile.ts
@@ -2,11 +2,12 @@
* WizardE2eProfile — a program's declarative e2e "test definition": the
* UI choices a headless e2e run should make at each decision point.
*
- * This is the test format the design discipline wants: the *choices* are
- * product knowledge about a program's flow, so they live on the program's
- * `ProgramConfig.e2e`, not hardcoded in the harness. The harness is generic —
- * it reads the profile and asks {@link decideE2eAction} what to commit on the
- * current screen. Add a profile to a program to make it e2e-drivable.
+ * The *choices* are product knowledge about a program's flow, but they live in
+ * the harness ({@link ./profiles}, keyed by program id) rather than on the
+ * program config — so none of this e2e machinery reaches the wizard's
+ * production source. The harness is generic: it reads a profile and asks
+ * {@link decideE2eAction} what to commit on the current screen. Add a program's
+ * profile to {@link ./profiles} to make it e2e-drivable.
*/
import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
diff --git a/e2e-harness/profiles.ts b/e2e-harness/profiles.ts
new file mode 100644
index 00000000..48056194
--- /dev/null
+++ b/e2e-harness/profiles.ts
@@ -0,0 +1,36 @@
+/**
+ * Per-program e2e profiles — the UI choices a headless run makes driving each
+ * program's flow. These are product knowledge about the flows, but they live in
+ * the test harness (NOT on the program config) so none of this e2e machinery
+ * reaches the wizard's production source. Look one up with {@link profileFor}.
+ */
+
+import { Program, type ProgramId } from '@lib/programs/program-registry';
+import { DEFAULT_E2E_PROFILE, type WizardE2eProfile } from './e2e-profile.js';
+
+/**
+ * PostHog integration happy path: confirm the intro, push past any health-check
+ * issue, pick the first setup option, skip MCP + Slack, delete installed skills.
+ */
+const POSTHOG_INTEGRATION_PROFILE: WizardE2eProfile = {
+ setup: 'first', // pick the first setup option
+ healthCheck: 'dismiss', // push past any health-check issue
+ mcp: 'skip', // skip MCP
+ slack: 'skip', // skip Slack
+ skills: 'delete', // delete installed skills
+ ask: 'first', // NOTE(review): not described above; presumably first option — confirm
+};
+
+const PROFILES: Partial<Record<ProgramId, WizardE2eProfile>> = {
+  [Program.PostHogIntegration]: POSTHOG_INTEGRATION_PROFILE,
+};
+
+/** The e2e profile for a program, or the happy-path default if none is set. */
+export function profileFor(program: ProgramId): WizardE2eProfile {
+  return PROFILES[program] ?? DEFAULT_E2E_PROFILE;
+}
+
+/** Whether a program has an explicit (non-default) profile — own keys only. */
+export function hasProfile(program: ProgramId): boolean {
+  return Object.prototype.hasOwnProperty.call(PROFILES, program);
+}
diff --git a/src/lib/ci-driver/recorder.ts b/e2e-harness/recorder.ts
similarity index 100%
rename from src/lib/ci-driver/recorder.ts
rename to e2e-harness/recorder.ts
diff --git a/src/lib/ci-driver/replay.ts b/e2e-harness/replay.ts
similarity index 100%
rename from src/lib/ci-driver/replay.ts
rename to e2e-harness/replay.ts
diff --git a/src/lib/ci-driver/wizard-ci-driver.ts b/e2e-harness/wizard-ci-driver.ts
similarity index 100%
rename from src/lib/ci-driver/wizard-ci-driver.ts
rename to e2e-harness/wizard-ci-driver.ts
diff --git a/src/lib/ci-driver/wizard-ci-tools.ts b/e2e-harness/wizard-ci-tools.ts
similarity index 100%
rename from src/lib/ci-driver/wizard-ci-tools.ts
rename to e2e-harness/wizard-ci-tools.ts
diff --git a/package.json b/package.json
index c348b742..450253e0 100644
--- a/package.json
+++ b/package.json
@@ -158,6 +158,7 @@
"^ink$": "/__mocks__/ink.ts",
"^@env$": "/src/env.ts",
"^@lib/(.*)$": "/src/lib/$1",
+ "^@e2e-harness/(.*)$": "<rootDir>/e2e-harness/$1",
"^@utils/(.*)$": "/src/utils/$1",
"^@ui$": "/src/ui/index.ts",
"^@ui/(.*)$": "/src/ui/$1",
diff --git a/scripts/README.md b/scripts/README.md
index 38dc8a7d..7bcddd09 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -37,11 +37,11 @@ Real `--e2e` runs also drop a recording at
## Background
-The control plane lives in [`src/lib/ci-driver/`](../src/lib/ci-driver/) —
-`WizardCiDriver` (read/act over the store), the screen→action registry, the
-`wizard-ci-tools` MCP server, the e2e profile, and the recorder/replay. See
-[`ARCHITECTURE.md`](../src/lib/ci-driver/ARCHITECTURE.md) for how an agent
-drives these (env strip, scoped project id, gotchas).
+The control plane lives in [`e2e-harness/`](../e2e-harness/) — out of `src/`, so
+none of it ships in prod. `WizardCiDriver` (read/act over the store), the
+screen→action registry, the `wizard-ci-tools` MCP server, the e2e profiles, and
+the recorder/replay. See [`ARCHITECTURE.md`](../e2e-harness/ARCHITECTURE.md) for
+how an agent drives these (env strip, scoped project id, gotchas).
> **Security-leak repro scripts** (`relay-prod.no-jest.ts`,
> `precedence.no-jest.ts`) that reproduce the `ANTHROPIC_BASE_URL`
diff --git a/scripts/ci-driver-demo.ts b/scripts/ci-driver-demo.ts
index a2575289..4ffe06d5 100644
--- a/scripts/ci-driver-demo.ts
+++ b/scripts/ci-driver-demo.ts
@@ -21,7 +21,7 @@ import { buildSession, RunPhase } from '@lib/wizard-session';
import { Program } from '@lib/programs/program-registry';
import { WizardReadiness } from '@lib/health-checks/readiness';
import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
-import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
function makeOneFileProject(): string {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'wizard-ci-'));
diff --git a/scripts/ci-driver-live-agent.ts b/scripts/ci-driver-live-agent.ts
index 94be6903..c65c2222 100644
--- a/scripts/ci-driver-live-agent.ts
+++ b/scripts/ci-driver-live-agent.ts
@@ -18,11 +18,11 @@ import { setUI } from '@ui/index';
import { buildSession } from '@lib/wizard-session';
import { buildAgentEnv } from '@lib/agent/agent-interface';
import { Program } from '@lib/programs/program-registry';
-import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
import {
createWizardCiToolsServer,
CI_TOOL_NAMES,
-} from '@lib/ci-driver/wizard-ci-tools';
+} from '@e2e-harness/wizard-ci-tools';
const GATEWAY_URL = 'https://gateway.us.posthog.com/wizard';
const MODEL = 'claude-haiku-4-5-20251001';
diff --git a/scripts/e2e-full-run.no-jest.ts b/scripts/e2e-full-run.no-jest.ts
index 3bbea84a..2f00d6ae 100644
--- a/scripts/e2e-full-run.no-jest.ts
+++ b/scripts/e2e-full-run.no-jest.ts
@@ -20,16 +20,16 @@ import { InkUI } from '@ui/tui/ink-ui';
import { setUI } from '@ui/index';
import { buildSession, RunPhase } from '@lib/wizard-session';
import { Program } from '@lib/programs/program-registry';
-import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
import { runAgent } from '@lib/agent/agent-runner';
import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
import type { ScreenName } from '@ui/tui/router';
import {
decideE2eAction,
- DEFAULT_E2E_PROFILE,
type WizardE2eProfile,
-} from '@lib/ci-driver/e2e-profile';
-import { WizardRecorder } from '@lib/ci-driver/recorder';
+} from '@e2e-harness/e2e-profile';
+import { profileFor } from '@e2e-harness/profiles';
+import { WizardRecorder } from '@e2e-harness/recorder';
const log = (m: string) => process.stdout.write(`[e2e] ${m}\n`);
@@ -95,12 +95,11 @@ async function main() {
const driver = new WizardCiDriver(store);
- // The program OWNS its e2e UI choices (ProgramConfig.e2e). The harness is
- // generic: it asks decideE2eAction what to commit on each screen. The
- // --keep-skills flag (E2E_KEEP_SKILLS) overrides the profile's skills policy.
+ // The harness owns the per-program e2e UI choices (profileFor). It asks
+ // decideE2eAction what to commit on each screen. The --keep-skills flag
+ // (E2E_KEEP_SKILLS) overrides the profile's skills policy.
const profile: WizardE2eProfile = {
- ...DEFAULT_E2E_PROFILE,
- ...(posthogIntegrationConfig.e2e ?? {}),
+ ...profileFor(Program.PostHogIntegration),
...(keepSkills ? { skills: 'keep' as const } : {}),
};
log(`e2e profile: ${JSON.stringify(profile)}`);
diff --git a/scripts/record-demo.no-jest.ts b/scripts/record-demo.no-jest.ts
index 4996a4a1..b268c270 100644
--- a/scripts/record-demo.no-jest.ts
+++ b/scripts/record-demo.no-jest.ts
@@ -17,13 +17,10 @@ import { FRAMEWORK_REGISTRY } from '@lib/registry';
import { WizardReadiness } from '@lib/health-checks/readiness';
import { Program } from '@lib/programs/program-registry';
import { ScreenId } from '@ui/tui/router';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
-import { WizardCiDriver } from '@lib/ci-driver/wizard-ci-driver';
-import {
- decideE2eAction,
- DEFAULT_E2E_PROFILE,
-} from '@lib/ci-driver/e2e-profile';
-import { WizardRecorder } from '@lib/ci-driver/recorder';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
+import { decideE2eAction } from '@e2e-harness/e2e-profile';
+import { profileFor } from '@e2e-harness/profiles';
+import { WizardRecorder } from '@e2e-harness/recorder';
const out = process.env.RECORDING_OUT ?? '/tmp/wizard-demo.recording.json';
@@ -43,7 +40,7 @@ const rec = new WizardRecorder(
rec.start();
const driver = new WizardCiDriver(store);
-const profile = posthogIntegrationConfig.e2e ?? DEFAULT_E2E_PROFILE;
+const profile = profileFor(Program.PostHogIntegration);
for (let i = 0; i < 40; i++) {
const state = driver.readState();
diff --git a/scripts/render-snapshots.no-jest.ts b/scripts/render-snapshots.no-jest.ts
index 401c036b..6706cc8d 100644
--- a/scripts/render-snapshots.no-jest.ts
+++ b/scripts/render-snapshots.no-jest.ts
@@ -14,7 +14,7 @@
import { mkdirSync, writeFileSync, rmSync } from 'fs';
import { join } from 'path';
import type { ProgramId } from '@ui/tui/router';
-import { loadRecording, renderFrame } from '@lib/ci-driver/replay';
+import { loadRecording, renderFrame } from '@e2e-harness/replay';
const [recordingPath, outDir] = process.argv.slice(2);
if (!recordingPath || !outDir) {
diff --git a/scripts/replay-e2e.no-jest.ts b/scripts/replay-e2e.no-jest.ts
index ff9935ec..f3420ab3 100644
--- a/scripts/replay-e2e.no-jest.ts
+++ b/scripts/replay-e2e.no-jest.ts
@@ -8,7 +8,7 @@
*/
import { createInterface } from 'readline';
import type { ProgramId } from '@ui/tui/router';
-import { loadRecording, renderFrame, frameHeader } from '@lib/ci-driver/replay';
+import { loadRecording, renderFrame, frameHeader } from '@e2e-harness/replay';
const ENTER_ALT = '\x1b[?1049h';
const LEAVE_ALT = '\x1b[?1049l';
diff --git a/src/lib/programs/posthog-integration/e2e.ts b/src/lib/programs/posthog-integration/e2e.ts
deleted file mode 100644
index 7affe7fc..00000000
--- a/src/lib/programs/posthog-integration/e2e.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * E2E test definition for the PostHog integration flow — the UI choices
- * `wizard-ci --e2e` makes when driving this program headlessly.
- *
- * Lives next to the program (not in the test harness) because the choices are
- * product knowledge about this flow. The harness reads it via
- * `ProgramConfig.e2e` and asks `decideE2eAction` what to commit on each screen.
- */
-
-import type { WizardE2eProfile } from '@lib/ci-driver/e2e-profile';
-
-/**
- * Happy path: confirm the intro, push past any health-check issue, pick the
- * first setup option, skip MCP + Slack, and delete the installed skills.
- */
-export const POSTHOG_INTEGRATION_E2E_PROFILE: WizardE2eProfile = {
- setup: 'first',
- healthCheck: 'dismiss',
- mcp: 'skip',
- slack: 'skip',
- skills: 'delete',
- ask: 'first',
-};
diff --git a/src/lib/programs/posthog-integration/index.ts b/src/lib/programs/posthog-integration/index.ts
index a1156390..8da51862 100644
--- a/src/lib/programs/posthog-integration/index.ts
+++ b/src/lib/programs/posthog-integration/index.ts
@@ -20,7 +20,6 @@ import { requestDeepLink } from '@utils/provisioning';
import { openTrackedLink, withUtm } from '@utils/links';
import type { CloudRegion } from '@utils/types';
import { POSTHOG_INTEGRATION_PROGRAM } from './steps.js';
-import { POSTHOG_INTEGRATION_E2E_PROFILE } from './e2e.js';
import { getContentBlocks } from './content/index.js';
import { buildCodingAgentPrompt } from './handoff.js';
@@ -51,10 +50,6 @@ export const posthogIntegrationConfig: ProgramConfig = {
steps: POSTHOG_INTEGRATION_PROGRAM,
getContentBlocks,
- // E2E test definition — the UI choices `wizard-ci --e2e` makes driving this
- // flow headlessly. Lives in ./e2e.ts (product knowledge, not harness logic).
- e2e: POSTHOG_INTEGRATION_E2E_PROFILE,
-
// Basic integration runs without structured user input; drop wizard_ask
// so the model can't pop modal prompts mid-run. The runner forwards this
// list to the general-purpose subagent as well, so dispatched subagents
diff --git a/src/lib/programs/program-step.ts b/src/lib/programs/program-step.ts
index ad0c6042..55a06132 100644
--- a/src/lib/programs/program-step.ts
+++ b/src/lib/programs/program-step.ts
@@ -257,13 +257,6 @@ export interface ProgramConfig {
* `ProgramCliSurface` for semantics.
*/
cli?: ProgramCliSurface;
- /**
- * E2E test definition: the UI choices a headless control-plane run
- * (`wizard-ci --e2e`) makes at each decision point of THIS program's flow.
- * Product knowledge about the flow lives here, not in the test harness.
- * Absent → the program isn't e2e-drivable yet. See `WizardE2eProfile`.
- */
- e2e?: import('@lib/ci-driver/e2e-profile').WizardE2eProfile;
}
/**
diff --git a/tsconfig.build.json b/tsconfig.build.json
index ee22a19d..8618277c 100644
--- a/tsconfig.build.json
+++ b/tsconfig.build.json
@@ -20,6 +20,7 @@
"paths": {
"@env": ["./src/env.ts"],
"@lib/*": ["./src/lib/*"],
+ "@e2e-harness/*": ["./e2e-harness/*"],
"@utils/*": ["./src/utils/*"],
"@ui": ["./src/ui/index.ts"],
"@ui/*": ["./src/ui/*"],
diff --git a/tsconfig.json b/tsconfig.json
index 882ce510..a54d5a08 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -19,11 +19,10 @@
"src/**/*",
"test/**/*",
"e2e-tests/**/*",
+ "e2e-harness/**/*",
"types/**/*"
],
- "exclude": [
- "e2e-tests/test-applications/**/*"
- ],
+ "exclude": ["e2e-tests/test-applications/**/*"],
"ts-node": {
"files": true
}
From e44fe55a84037b3dad314bbc6f4d17ed33e94d00 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:23:32 -0400
Subject: [PATCH 10/38] docs(e2e-harness): cross-link the workbench
visual-snapshots flow + env
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
ARCHITECTURE.md now documents the wizard-ci-snapshots visual-regression flow
(real run → render → diff → side-by-side report) and the env it needs.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/ARCHITECTURE.md | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index bdf80197..5f62f8af 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -109,6 +109,19 @@ redacted `session`) or render specific frames to ANSI with `renderFrame()` from
`replay.ts`. The access token is redacted, so recordings are safe to share.
Code: `recorder.ts` (capture) + `replay.ts` (render).
+## Visual-regression snapshots (the workbench flow)
+
+[wizard-workbench](https://github.com/PostHog/wizard-workbench) drives this for
+real-run **visual regression**: `pnpm wizard-ci-snapshots` runs each CI-e2e test
+definition as a real `--e2e` run, renders every key-moment frame to a `.ans`
+snapshot (via `scripts/render-snapshots.no-jest.ts` → `replay.ts`), and diffs
+against a committed baseline, writing a side-by-side `report.html`. Run-to-run
+agent differences (e.g. a different task enqueued) are surfaced for a human to
+review, not asserted away. It needs `WIZARD_PATH` pointing at a checkout that
+has this `e2e-harness/`, plus the e2e env (`POSTHOG_PERSONAL_API_KEY`,
+`POSTHOG_WIZARD_PROJECT_ID`, `POSTHOG_REGION`). See
+`services/wizard-ci/snapshots.ts` there.
+
## Driving it as a true LLM loop (optional)
`wizard-ci-tools.ts` exposes `read_state` / `list_actions` / `perform_action` as
From 2a325ef118267507d6b14995cee01032d48889c9 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:37:20 -0400
Subject: [PATCH 11/38] docs(posthog-integration): describe the e2e test path
next to the program
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A test/ README documents this program's e2e test definition — the path the
headless run walks and the option it auto-takes at each screen (confirm intro,
dismiss outage, first setup option, skip mcp/slack, delete skills). It's the
human description; the runnable profile stays in e2e-harness/profiles.ts. No e2e
machinery returns to prod src — this is documentation only.
Co-Authored-By: Claude Opus 4.8
---
.../posthog-integration/test/README.md | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)
create mode 100644 src/lib/programs/posthog-integration/test/README.md
diff --git a/src/lib/programs/posthog-integration/test/README.md b/src/lib/programs/posthog-integration/test/README.md
new file mode 100644
index 00000000..d709da08
--- /dev/null
+++ b/src/lib/programs/posthog-integration/test/README.md
@@ -0,0 +1,44 @@
+# PostHog Integration — e2e test path
+
+The **test definition** for this program: the path a headless e2e run walks and
+the option it auto-takes at each decision point. This is the human description;
+the runnable form is the profile in
+[`e2e-harness/profiles.ts`](../../../../../e2e-harness/profiles.ts)
+(`profileFor(Program.PostHogIntegration)`), driven by `decideE2eAction`. Keep
+the two in sync — change the path here, change the profile there.
+
+The profile that produces this path:
+
+```ts
+{ setup: 'first', healthCheck: 'dismiss', mcp: 'skip', slack: 'skip', skills: 'delete', ask: 'first' }
+```
+
+## The path
+
+A run walks these screens in order; `(external)` means the harness waits while
+the runner or the agent advances on its own.
+
+| # | screen | auto-decision | why |
+| --- | --------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| 1 | `intro` | **confirm & continue** (`confirm_setup`) | start the flow |
+| 2 | `health-check` | **dismiss any outage** (`dismiss_outage`) | `healthCheck: 'dismiss'` — proceed even if the readiness probe flags an issue |
+| 3 | `setup` | **pick the first option** (`choose`) | `setup: 'first'` — only appears when the framework needs disambiguation (e.g. Next.js App vs Pages router); Node/Express has no setup question |
+| 4 | `auth` | _(external)_ | the runner injects credentials; harness waits |
+| 5 | `run` | _(external)_ | the real agent integrates the SDK + instruments events; harness waits |
+| 6 | `outro` | **dismiss** (`dismiss_outro`) | leave the summary |
+| 7 | `mcp` | **skip** (`set_mcp_outcome: skipped`) | `mcp: 'skip'` — don't install the MCP server |
+| 8 | `slack-connect` | **skip** (`dismiss_slack`) | `slack: 'skip'` |
+| 9 | `keep-skills` | **delete** (`keep_skills: kept=false`) | `skills: 'delete'` — leave nothing behind. Terminal: this is the run's done-signal |
+
+Two more the happy path doesn't normally hit, but the profile covers:
+
+- `mcp-suggested-prompts` → **dismiss** (only if the MCP step surfaces it).
+- `wizard_ask` overlay → **answer with the first option** (`ask: 'first'`). The
+ integration flow disallows `wizard_ask`, so this shouldn't fire here.
+
+## Adding a variant
+
+To test a different path (e.g. keep skills, or install MCP), pass a flag the
+harness understands (`--keep-skills` overrides `skills`), or add another profile
+to `e2e-harness/profiles.ts` and select it. Don't add e2e machinery back into
+this program — the path is described here, the execution lives in the harness.
From e7484ee2492ab4fe45f3a97838864663ba5343d6 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:41:27 -0400
Subject: [PATCH 12/38] refactor(e2e): make the test definition a readable JSON
the harness loads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Each program declares its e2e test path as src/lib/programs/<program>/test/e2e.json
— a `profile` (the options the headless run auto-takes) plus a documented `path`
of every screen. The harness imports the `profile` in e2e-harness/profiles.ts
(single source of truth, no prose duplication). Matches the repo's existing
JSON-data pattern (mcp-role-prompts.copy.json); resolveJsonModule already on.
It's data, imported only by the harness — zero prod imports, absent from the
tsdown bundle. Full harness suite + runtime load verified.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/profiles.ts | 28 ++++-----
.../posthog-integration/test/README.md | 61 ++++++-------------
.../posthog-integration/test/e2e.json | 35 +++++++++++
3 files changed, 63 insertions(+), 61 deletions(-)
create mode 100644 src/lib/programs/posthog-integration/test/e2e.json
diff --git a/e2e-harness/profiles.ts b/e2e-harness/profiles.ts
index 48056194..9cc8e301 100644
--- a/e2e-harness/profiles.ts
+++ b/e2e-harness/profiles.ts
@@ -1,28 +1,22 @@
/**
* Per-program e2e profiles — the UI choices a headless run makes driving each
- * program's flow. These are product knowledge about the flows, but they live in
- * the test harness (NOT on the program config) so none of this e2e machinery
- * reaches the wizard's production source. Look one up with {@link profileFor}.
+ * program's flow.
+ *
+ * Each program declares its test path as a readable JSON next to the program
+ * (`src/lib/programs/<program>/test/e2e.json`): a `profile` (the options the run
+ * auto-takes) plus a documented `path`. We load the `profile` here and map it by
+ * program id. Those JSONs are *data*, imported only by this harness — never by
+ * prod code — so they don't reach the wizard's production source or its bundle.
+ * Look one up with {@link profileFor}.
*/
import { Program, type ProgramId } from '@lib/programs/program-registry';
import { DEFAULT_E2E_PROFILE, type WizardE2eProfile } from './e2e-profile.js';
-
-/**
- * PostHog integration happy path: confirm the intro, push past any health-check
- * issue, pick the first setup option, skip MCP + Slack, delete installed skills.
- */
-const POSTHOG_INTEGRATION_PROFILE: WizardE2eProfile = {
- setup: 'first',
- healthCheck: 'dismiss',
- mcp: 'skip',
- slack: 'skip',
- skills: 'delete',
- ask: 'first',
-};
+import posthogIntegrationE2e from '@lib/programs/posthog-integration/test/e2e.json';
const PROFILES: Partial<Record<ProgramId, WizardE2eProfile>> = {
- [Program.PostHogIntegration]: POSTHOG_INTEGRATION_PROFILE,
+ [Program.PostHogIntegration]:
+ posthogIntegrationE2e.profile as WizardE2eProfile,
};
/** The e2e profile for a program, or the happy-path default if none is set. */
diff --git a/src/lib/programs/posthog-integration/test/README.md b/src/lib/programs/posthog-integration/test/README.md
index d709da08..34a008a6 100644
--- a/src/lib/programs/posthog-integration/test/README.md
+++ b/src/lib/programs/posthog-integration/test/README.md
@@ -1,44 +1,17 @@
-# PostHog Integration — e2e test path
-
-The **test definition** for this program: the path a headless e2e run walks and
-the option it auto-takes at each decision point. This is the human description;
-the runnable form is the profile in
-[`e2e-harness/profiles.ts`](../../../../../e2e-harness/profiles.ts)
-(`profileFor(Program.PostHogIntegration)`), driven by `decideE2eAction`. Keep
-the two in sync — change the path here, change the profile there.
-
-The profile that produces this path:
-
-```ts
-{ setup: 'first', healthCheck: 'dismiss', mcp: 'skip', slack: 'skip', skills: 'delete', ask: 'first' }
-```
-
-## The path
-
-A run walks these screens in order; `(external)` means the harness waits while
-the runner or the agent advances on its own.
-
-| # | screen | auto-decision | why |
-| --- | --------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| 1 | `intro` | **confirm & continue** (`confirm_setup`) | start the flow |
-| 2 | `health-check` | **dismiss any outage** (`dismiss_outage`) | `healthCheck: 'dismiss'` — proceed even if the readiness probe flags an issue |
-| 3 | `setup` | **pick the first option** (`choose`) | `setup: 'first'` — only appears when the framework needs disambiguation (e.g. Next.js App vs Pages router); Node/Express has no setup question |
-| 4 | `auth` | _(external)_ | the runner injects credentials; harness waits |
-| 5 | `run` | _(external)_ | the real agent integrates the SDK + instruments events; harness waits |
-| 6 | `outro` | **dismiss** (`dismiss_outro`) | leave the summary |
-| 7 | `mcp` | **skip** (`set_mcp_outcome: skipped`) | `mcp: 'skip'` — don't install the MCP server |
-| 8 | `slack-connect` | **skip** (`dismiss_slack`) | `slack: 'skip'` |
-| 9 | `keep-skills` | **delete** (`keep_skills: kept=false`) | `skills: 'delete'` — leave nothing behind. Terminal: this is the run's done-signal |
-
-Two more the happy path doesn't normally hit, but the profile covers:
-
-- `mcp-suggested-prompts` → **dismiss** (only if the MCP step surfaces it).
-- `wizard_ask` overlay → **answer with the first option** (`ask: 'first'`). The
- integration flow disallows `wizard_ask`, so this shouldn't fire here.
-
-## Adding a variant
-
-To test a different path (e.g. keep skills, or install MCP), pass a flag the
-harness understands (`--keep-skills` overrides `skills`), or add another profile
-to `e2e-harness/profiles.ts` and select it. Don't add e2e machinery back into
-this program — the path is described here, the execution lives in the harness.
+# PostHog Integration — e2e test definition
+
+[`e2e.json`](e2e.json) is this program's **test definition**: the options a
+headless e2e run auto-takes at each decision point of the flow, plus a
+documented `path` of every screen and what it does.
+
+- **`profile`** — the machine-read part. The harness loads it via
+ `profileFor(Program.PostHogIntegration)`
+ ([`e2e-harness/profiles.ts`](../../../../../e2e-harness/profiles.ts)) and asks
+ `decideE2eAction` what to commit on each screen.
+- **`path`** — the human-read part: each screen in order and the auto-decision,
+ so you can see the whole walk at a glance.
+
+It's **data, not code** — imported only by the harness, never by prod, so it
+doesn't ship in the bundle. To change the test path, edit `e2e.json`. To add a
+new program's test path, drop an `e2e.json` in its own `test/` folder and map it
+in `profiles.ts`.
diff --git a/src/lib/programs/posthog-integration/test/e2e.json b/src/lib/programs/posthog-integration/test/e2e.json
new file mode 100644
index 00000000..946435c9
--- /dev/null
+++ b/src/lib/programs/posthog-integration/test/e2e.json
@@ -0,0 +1,35 @@
+{
+ "program": "posthog-integration",
+ "summary": "Happy path: confirm intro, push past health issues, take the first setup option, skip MCP + Slack, delete installed skills.",
+ "profile": {
+ "setup": "first",
+ "healthCheck": "dismiss",
+ "mcp": "skip",
+ "slack": "skip",
+ "skills": "delete",
+ "ask": "first"
+ },
+ "path": [
+ { "screen": "intro", "auto": "confirm & continue" },
+ {
+ "screen": "health-check",
+ "auto": "dismiss outage — proceed even if the readiness probe flags an issue"
+ },
+ {
+ "screen": "setup",
+ "auto": "pick the first option — only appears when the framework needs disambiguation (e.g. Next.js router); Node/Express has none"
+ },
+ { "screen": "auth", "auto": "(external) — the runner injects credentials" },
+ {
+ "screen": "run",
+ "auto": "(external) — the real agent integrates the SDK + instruments events"
+ },
+ { "screen": "outro", "auto": "dismiss" },
+ { "screen": "mcp", "auto": "skip — don't install the MCP server" },
+ { "screen": "slack-connect", "auto": "skip" },
+ {
+ "screen": "keep-skills",
+ "auto": "delete — leave nothing behind (terminal: the run's done-signal)"
+ }
+ ]
+}
From 18853dd7988167a29f2ed98ad5ea1142f4270d02 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:53:55 -0400
Subject: [PATCH 13/38] docs(e2e-harness): instrument the perform_action trace
across the hops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add the end-to-end trace (agent → perform_action → driver → action-registry →
store.completeSetup → emitChange → router re-resolve → readState) as a comment at
the perform_action tool, with cross-referenced breadcrumbs at the driver hop
(one committed mutation per call) and the action-registry hop (the store setter +
flag-flip the screen sequence reacts to). Harness-only; prod store.ts untouched.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/action-registry.ts | 6 ++++++
e2e-harness/wizard-ci-driver.ts | 11 +++++++++--
e2e-harness/wizard-ci-tools.ts | 13 +++++++++++++
3 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/e2e-harness/action-registry.ts b/e2e-harness/action-registry.ts
index cde6315a..db58b8ce 100644
--- a/e2e-harness/action-registry.ts
+++ b/e2e-harness/action-registry.ts
@@ -77,6 +77,12 @@ export const NO_ACTION_SCREENS: ReadonlySet = new Set([
* Intro-style screens whose only action is "confirm and continue", committing
* the same `setupConfirmed` flag the IntroScreen sets. Several programs reuse
* this shape, so they share one action via this helper.
+ *
+ * This is the store hop of the trace in {@link ./wizard-ci-tools} (perform_action):
+ * `apply` calls `store.completeSetup()`, which does
+ * `$session.setKey('setupConfirmed', true)` + `emitChange()`. `router.resolve`
+ * then treats the intro as complete and renders the next screen — no imperative
+ * navigation, just a flag flip the screen sequence reacts to.
*/
const confirmSetupAction: DriverAction = {
id: 'confirm_setup',
diff --git a/e2e-harness/wizard-ci-driver.ts b/e2e-harness/wizard-ci-driver.ts
index cfa85a11..4fb27a2b 100644
--- a/e2e-harness/wizard-ci-driver.ts
+++ b/e2e-harness/wizard-ci-driver.ts
@@ -136,16 +136,23 @@ export class WizardCiDriver {
* Apply a named action via its store setter, then return the next state.
* Throws UnknownActionError if the action isn't legal on the current screen,
* or MissingParamError if a required param is absent.
+ *
+ * The middle hop of the trace in {@link ./wizard-ci-tools} (perform_action):
+ * exactly one store setter fires per call (via `action.apply`), `emitChange()`
+ * bumps `$version`, `router.resolve` re-derives the screen, and the fresh
+ * `readState()` reflects it.
*/
performAction(
actionId: string,
params: Record = {},
): CiState {
const screen = this.store.currentScreen;
+ // Resolve the action against the CURRENT screen's registry entry, so a
+ // caller can't commit something illegal for where the flow actually is.
const action = actionsForScreen(screen).find((a) => a.id === actionId);
if (!action) throw new UnknownActionError(actionId, screen);
- action.apply(this.store, params); // may throw MissingParamError
- return this.readState();
+ action.apply(this.store, params); // the single committed mutation
+ return this.readState(); // next state, screen already re-derived
}
/**
diff --git a/e2e-harness/wizard-ci-tools.ts b/e2e-harness/wizard-ci-tools.ts
index f0d6db09..61fd0295 100644
--- a/e2e-harness/wizard-ci-tools.ts
+++ b/e2e-harness/wizard-ci-tools.ts
@@ -68,6 +68,19 @@ export async function createWizardCiToolsServer(
}),
);
+ // End to end, one perform_action call lands as a single committed store
+ // mutation and re-derives the rendered screen:
+ //
+ // agent → mcp__wizard-ci-tools__perform_action {action:"confirm_setup"}
+ // → driver.performAction("confirm_setup", {})
+ // → actionsForScreen("intro") finds confirm_setup
+ // → apply → store.completeSetup()
+ // → $session.setKey("setupConfirmed", true); emitChange()
+ // → $version 0→1 → router.resolve(session) now skips intro
+ // (isComplete) → returns "health-check"
+ // → driver.readState() → { currentScreen:"health-check",
+ // actions:[dismiss_outage], … }
+ // returned to the agent, which calls read_state and picks the next action.
const performAction = tool(
'perform_action',
'Commit a decision by invoking a legal action for the current screen ' +
From fcb2e548b98648cde51f3d4d89077bf8782513b0 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 16:56:10 -0400
Subject: [PATCH 14/38] docs(e2e-harness): state the never-ships-to-prod
guarantee in each module
Add a header note to wizard-ci-tools / wizard-ci-driver / action-registry /
recorder / replay: each lives in e2e-harness/, is imported only by scripts/tests,
and is absent from the tsdown bundle (bin.ts is the only entry). Addresses the
"this looks shippable" worry right where a reader meets the code (esp. the MCP
server + SDK import). Verified: no e2e symbols in dist/.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/action-registry.ts | 4 ++++
e2e-harness/recorder.ts | 4 ++++
e2e-harness/replay.ts | 4 ++++
e2e-harness/wizard-ci-driver.ts | 4 ++++
e2e-harness/wizard-ci-tools.ts | 4 ++++
5 files changed, 20 insertions(+)
diff --git a/e2e-harness/action-registry.ts b/e2e-harness/action-registry.ts
index db58b8ce..dbce9548 100644
--- a/e2e-harness/action-registry.ts
+++ b/e2e-harness/action-registry.ts
@@ -10,6 +10,10 @@
* Discipline mirrors screen-registry.tsx: one entry per screen, kept exhaustive
* by a test over the ScreenId/Overlay enums. No product knowledge leaks in —
* actions speak only in store setters and generic params.
+ *
+ * Never ships to prod: the `apply` closures call prod store setters, but this
+ * registry lives in e2e-harness/ and is imported only by the driver/tests — no
+ * production code references it, so it never reaches the tsdown bundle.
*/
import type { WizardStore } from '@ui/tui/store';
diff --git a/e2e-harness/recorder.ts b/e2e-harness/recorder.ts
index 6e65c190..fdc40c9a 100644
--- a/e2e-harness/recorder.ts
+++ b/e2e-harness/recorder.ts
@@ -13,6 +13,10 @@
* Each frame stores the (secret-redacted) session plus tasks/status/event-plan,
* which is enough for {@link ../replay} to reconstruct a throwaway store and
* render the real Ink screen back to ANSI.
+ *
+ * Never ships to prod: it subscribes to a real store, but it lives in
+ * e2e-harness/ and no production code imports it — the live run never records
+ * itself, and it is absent from the tsdown bundle (`bin.ts` is the only entry).
*/
import type { WizardStore } from '@ui/tui/store';
diff --git a/e2e-harness/replay.ts b/e2e-harness/replay.ts
index d5729413..4a556391 100644
--- a/e2e-harness/replay.ts
+++ b/e2e-harness/replay.ts
@@ -7,6 +7,10 @@
* Rendering is offline against a disposable store, so screen effects (detection,
* prefetch) fire harmlessly against the recorded state and never touch the real
* run.
+ *
+ * Never ships to prod: it imports real screen components to render them, but it
+ * lives in e2e-harness/ and only scripts/tests import it — no production code
+ * does, so it is absent from the tsdown bundle (`bin.ts` is the only entry).
*/
import { readFileSync } from 'fs';
diff --git a/e2e-harness/wizard-ci-driver.ts b/e2e-harness/wizard-ci-driver.ts
index 4fb27a2b..04c34e9f 100644
--- a/e2e-harness/wizard-ci-driver.ts
+++ b/e2e-harness/wizard-ci-driver.ts
@@ -16,6 +16,10 @@
* state (typed-but-unsubmitted text, highlighted option, the wizard_ask
* per-question accumulator) is React-local and deliberately invisible here —
* the driver issues the final commit directly instead.
+ *
+ * Never ships to prod: it reads/mutates the store via the same setters the UI
+ * uses, but it lives in e2e-harness/ and no production code imports it, so it is
+ * absent from the tsdown bundle (`bin.ts` is the only build entry).
*/
import type { WizardStore } from '@ui/tui/store';
diff --git a/e2e-harness/wizard-ci-tools.ts b/e2e-harness/wizard-ci-tools.ts
index 61fd0295..e2bb7046 100644
--- a/e2e-harness/wizard-ci-tools.ts
+++ b/e2e-harness/wizard-ci-tools.ts
@@ -12,6 +12,10 @@
* Mirrors wizard-tools.ts: pure adapter behind a seam (the driver), importing
* no product knowledge. The driver does the work; this just speaks MCP. The
* SDK is dynamically imported so this module loads even where the SDK is mocked.
+ *
+ * Never ships to prod: despite building an MCP server and importing the agent
+ * SDK, this lives in e2e-harness/ and is never imported by production code —
+ * `bin.ts` is the only tsdown entry, so it is absent from the published bundle.
*/
import { z } from 'zod';
From cb439ff4761918bd2dfcfb3e0dc37a0558aa0857 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 17:01:36 -0400
Subject: [PATCH 15/38] revert: drop the explanatory comments from source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Moving the trace / never-ships / credentials notes to PR review comments anchored
to the lines instead — keep the source uncluttered.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/action-registry.ts | 10 ----------
e2e-harness/recorder.ts | 4 ----
e2e-harness/replay.ts | 4 ----
e2e-harness/wizard-ci-driver.ts | 15 ++-------------
e2e-harness/wizard-ci-tools.ts | 17 -----------------
5 files changed, 2 insertions(+), 48 deletions(-)
diff --git a/e2e-harness/action-registry.ts b/e2e-harness/action-registry.ts
index dbce9548..cde6315a 100644
--- a/e2e-harness/action-registry.ts
+++ b/e2e-harness/action-registry.ts
@@ -10,10 +10,6 @@
* Discipline mirrors screen-registry.tsx: one entry per screen, kept exhaustive
* by a test over the ScreenId/Overlay enums. No product knowledge leaks in —
* actions speak only in store setters and generic params.
- *
- * Never ships to prod: the `apply` closures call prod store setters, but this
- * registry lives in e2e-harness/ and is imported only by the driver/tests — no
- * production code references it, so it never reaches the tsdown bundle.
*/
import type { WizardStore } from '@ui/tui/store';
@@ -81,12 +77,6 @@ export const NO_ACTION_SCREENS: ReadonlySet = new Set([
* Intro-style screens whose only action is "confirm and continue", committing
* the same `setupConfirmed` flag the IntroScreen sets. Several programs reuse
* this shape, so they share one action via this helper.
- *
- * This is the store hop of the trace in {@link ./wizard-ci-tools} (perform_action):
- * `apply` calls `store.completeSetup()`, which does
- * `$session.setKey('setupConfirmed', true)` + `emitChange()`. `router.resolve`
- * then treats the intro as complete and renders the next screen — no imperative
- * navigation, just a flag flip the screen sequence reacts to.
*/
const confirmSetupAction: DriverAction = {
id: 'confirm_setup',
diff --git a/e2e-harness/recorder.ts b/e2e-harness/recorder.ts
index fdc40c9a..6e65c190 100644
--- a/e2e-harness/recorder.ts
+++ b/e2e-harness/recorder.ts
@@ -13,10 +13,6 @@
* Each frame stores the (secret-redacted) session plus tasks/status/event-plan,
* which is enough for {@link ../replay} to reconstruct a throwaway store and
* render the real Ink screen back to ANSI.
- *
- * Never ships to prod: it subscribes to a real store, but it lives in
- * e2e-harness/ and no production code imports it — the live run never records
- * itself, and it is absent from the tsdown bundle (`bin.ts` is the only entry).
*/
import type { WizardStore } from '@ui/tui/store';
diff --git a/e2e-harness/replay.ts b/e2e-harness/replay.ts
index 4a556391..d5729413 100644
--- a/e2e-harness/replay.ts
+++ b/e2e-harness/replay.ts
@@ -7,10 +7,6 @@
* Rendering is offline against a disposable store, so screen effects (detection,
* prefetch) fire harmlessly against the recorded state and never touch the real
* run.
- *
- * Never ships to prod: it imports real screen components to render them, but it
- * lives in e2e-harness/ and only scripts/tests import it — no production code
- * does, so it is absent from the tsdown bundle (`bin.ts` is the only entry).
*/
import { readFileSync } from 'fs';
diff --git a/e2e-harness/wizard-ci-driver.ts b/e2e-harness/wizard-ci-driver.ts
index 04c34e9f..cfa85a11 100644
--- a/e2e-harness/wizard-ci-driver.ts
+++ b/e2e-harness/wizard-ci-driver.ts
@@ -16,10 +16,6 @@
* state (typed-but-unsubmitted text, highlighted option, the wizard_ask
* per-question accumulator) is React-local and deliberately invisible here —
* the driver issues the final commit directly instead.
- *
- * Never ships to prod: it reads/mutates the store via the same setters the UI
- * uses, but it lives in e2e-harness/ and no production code imports it, so it is
- * absent from the tsdown bundle (`bin.ts` is the only build entry).
*/
import type { WizardStore } from '@ui/tui/store';
@@ -140,23 +136,16 @@ export class WizardCiDriver {
* Apply a named action via its store setter, then return the next state.
* Throws UnknownActionError if the action isn't legal on the current screen,
* or MissingParamError if a required param is absent.
- *
- * The middle hop of the trace in {@link ./wizard-ci-tools} (perform_action):
- * exactly one store setter fires per call (via `action.apply`), `emitChange()`
- * bumps `$version`, `router.resolve` re-derives the screen, and the fresh
- * `readState()` reflects it.
*/
performAction(
actionId: string,
params: Record = {},
): CiState {
const screen = this.store.currentScreen;
- // Resolve the action against the CURRENT screen's registry entry, so a
- // caller can't commit something illegal for where the flow actually is.
const action = actionsForScreen(screen).find((a) => a.id === actionId);
if (!action) throw new UnknownActionError(actionId, screen);
- action.apply(this.store, params); // the single committed mutation
- return this.readState(); // next state, screen already re-derived
+ action.apply(this.store, params); // may throw MissingParamError
+ return this.readState();
}
/**
diff --git a/e2e-harness/wizard-ci-tools.ts b/e2e-harness/wizard-ci-tools.ts
index e2bb7046..f0d6db09 100644
--- a/e2e-harness/wizard-ci-tools.ts
+++ b/e2e-harness/wizard-ci-tools.ts
@@ -12,10 +12,6 @@
* Mirrors wizard-tools.ts: pure adapter behind a seam (the driver), importing
* no product knowledge. The driver does the work; this just speaks MCP. The
* SDK is dynamically imported so this module loads even where the SDK is mocked.
- *
- * Never ships to prod: despite building an MCP server and importing the agent
- * SDK, this lives in e2e-harness/ and is never imported by production code —
- * `bin.ts` is the only tsdown entry, so it is absent from the published bundle.
*/
import { z } from 'zod';
@@ -72,19 +68,6 @@ export async function createWizardCiToolsServer(
}),
);
- // End to end, one perform_action call lands as a single committed store
- // mutation and re-derives the rendered screen:
- //
- // agent → mcp__wizard-ci-tools__perform_action {action:"confirm_setup"}
- // → driver.performAction("confirm_setup", {})
- // → actionsForScreen("intro") finds confirm_setup
- // → apply → store.completeSetup()
- // → $session.setKey("setupConfirmed", true); emitChange()
- // → $version 0→1 → router.resolve(session) now skips intro
- // (isComplete) → returns "health-check"
- // → driver.readState() → { currentScreen:"health-check",
- // actions:[dismiss_outage], … }
- // returned to the agent, which calls read_state and picks the next action.
const performAction = tool(
'perform_action',
'Commit a decision by invoking a legal action for the current screen ' +
From 6e88d7daa1073d84f5fa185d165e96aabec91a82 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 17:27:17 -0400
Subject: [PATCH 16/38] chore(scripts): remove demo/proof scaffolding from the
PR
Drop the three scripts that were scaffolding while building, not part of the
shipped feature:
- ci-driver-demo.ts offline no-agent control-loop demo (covered by tests)
- ci-driver-live-agent.ts manual LLM-drives-MCP proof (needs a key)
- record-demo.no-jest.ts offline sample-recording generator (real --e2e records)
Keep the three the workbench actually orchestrates: e2e-full-run, render-snapshots,
replay-e2e. Update scripts/README.md + ARCHITECTURE.md accordingly.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/ARCHITECTURE.md | 3 +-
scripts/README.md | 42 +++-----
scripts/ci-driver-demo.ts | 138 --------------------------
scripts/ci-driver-live-agent.ts | 170 --------------------------------
scripts/record-demo.no-jest.ts | 78 ---------------
5 files changed, 15 insertions(+), 416 deletions(-)
delete mode 100644 scripts/ci-driver-demo.ts
delete mode 100644 scripts/ci-driver-live-agent.ts
delete mode 100644 scripts/record-demo.no-jest.ts
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index 5f62f8af..9ca36af1 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -97,8 +97,9 @@ store/router changes (a route, a task-list update, a status line, a runPhase
change, an overlay). Replay reconstructs each frame's store and renders the real
Ink screen back to the terminal, so a run can be watched back to verify it:
+A real `--e2e` run drops a recording at `/tmp/wizard-e2e-<app>.recording.json`.
+
```bash
-npx tsx scripts/record-demo.no-jest.ts # sample, offline
npx tsx scripts/replay-e2e.no-jest.ts <recording.json> --step # Enter ▸ step
npx tsx scripts/replay-e2e.no-jest.ts <recording.json> --delay 1200 # auto-play
```
diff --git a/scripts/README.md b/scripts/README.md
index 7bcddd09..619a8343 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -2,38 +2,22 @@
Helper scripts. The build-related ones (`generate-version.cjs`,
`smoke-test*.sh`, `check-screens.tsx`) are wired into `package.json`. The rest
-below are **manual, runnable tools** for the `wizard-ci-tools` control plane and
-e2e — each is a standalone `tsx` entry, named `*.no-jest.ts` so Jest ignores it.
+below are **manual, runnable tools** for headless e2e + snapshots — each is a
+standalone `tsx` entry, named `*.no-jest.ts` so Jest ignores it.
Run from the repo root, e.g. `npx tsx scripts/<name>.no-jest.ts`.
-## Control-plane e2e (drive the wizard headlessly via wizard-ci-tools)
-
-| Script | What it does | Needs |
-| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
-| **`ci-driver-demo.ts`** | Drives the real store/router/detection flow with `WizardCiDriver` — **offline, agent stubbed**. Proves the control loop on a 1-file project. | nothing |
-| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result (`E2E_RESULT_JSON`) and a recording (`E2E_RECORDING_JSON`). | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
-| **`ci-driver-live-agent.ts`** | A **real gateway LLM** drives the `wizard-ci-tools` MCP server (read_state / perform_action) to advance the wizard — agent-vs-agent proof. | `PHX_KEY_FILE` |
-
-> Normally you don't call these directly — `pnpm wizard-ci --e2e` (in
-> [wizard-workbench](https://github.com/PostHog/wizard-workbench)) orchestrates
-> `e2e-full-run` with the env hygiene + assertions.
-
-## Record & replay (view a run back in the terminal)
-
-| Script | What it does | Needs |
-| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- |
-| **`record-demo.no-jest.ts`** | Produces a sample recording **offline** (no agent, no network) by driving the flow with a `WizardRecorder`. Writes `/tmp/wizard-demo.recording.json` (override with `RECORDING_OUT`). | nothing |
-| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance, default) or `--delay ` (auto-play). | a `*.recording.json` |
-
-```bash
-# make a sample recording, then watch it
-npx tsx scripts/record-demo.no-jest.ts
-npx tsx scripts/replay-e2e.no-jest.ts /tmp/wizard-demo.recording.json --step
-```
-
-Real `--e2e` runs also drop a recording at
-`/tmp/wizard-e2e-.recording.json`.
+| Script | What it does | Needs |
+| --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
+| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result + a recording. | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
+| **`render-snapshots.no-jest.ts`** | Renders a recording's key-moment frames to per-frame `.ans` snapshots (real Ink → ANSI). Feeds the workbench visual-regression flow. | a `recording.json` + outDir |
+| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance) or `--delay <ms>` (auto-play). | a `recording.json` |
+
+> You usually don't call these directly — `pnpm wizard-ci --e2e` and
+> `pnpm wizard-ci-snapshots` (in
+> [wizard-workbench](https://github.com/PostHog/wizard-workbench)) orchestrate
+> them with the env hygiene + assertions. A real `--e2e` run drops a recording
+> at `/tmp/wizard-e2e-<app>.recording.json`.
## Background
diff --git a/scripts/ci-driver-demo.ts b/scripts/ci-driver-demo.ts
deleted file mode 100644
index 4ffe06d5..00000000
--- a/scripts/ci-driver-demo.ts
+++ /dev/null
@@ -1,138 +0,0 @@
-/**
- * Headless control-plane demo — runs the real wizard store/router/detection
- * flow with NO terminal and NO browser, driven entirely by WizardCiDriver.
- *
- * This is the runnable (tsx) sibling of the jest control-plane test: it proves
- * the same loop works outside a test harness, against real framework detection
- * on a 1-file project. The agent + auth steps are injected (the agent is a
- * separate, token-burning concern proven elsewhere) so this stays fast and
- * offline; every human decision goes through the driver's read/act surface.
- *
- * POSTHOG_WIZARD_INSTALL_DIR= tsx scripts/ci-driver-demo.ts
- */
-
-import fs from 'fs';
-import os from 'os';
-import path from 'path';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession, RunPhase } from '@lib/wizard-session';
-import { Program } from '@lib/programs/program-registry';
-import { WizardReadiness } from '@lib/health-checks/readiness';
-import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-
-function makeOneFileProject(): string {
- const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'wizard-ci-'));
- // The one file: a package.json declaring Next.js. Framework detection keys
- // off this, so the integration program resolves to the Next.js flow.
- fs.writeFileSync(
- path.join(dir, 'package.json'),
- JSON.stringify(
- { name: 'demo', private: true, dependencies: { next: '15.3.0' } },
- null,
- 2,
- ),
- );
- return dir;
-}
-
-const log = (msg: string) => process.stdout.write(` ${msg}\n`);
-
-async function main() {
- const installDir =
- process.env.POSTHOG_WIZARD_INSTALL_DIR ?? makeOneFileProject();
- process.stdout.write(`\nHeadless wizard-ci-tools demo on: ${installDir}\n\n`);
-
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store)); // real UI, never rendered → no Ink, no browser
- const session = buildSession({ installDir, ci: true });
- store.session = session;
-
- const driver = new WizardCiDriver(store);
-
- // Run the program's onReady hooks — this is REAL framework detection.
- await store.runReadyHooks();
-
- const seen: ScreenName[] = [];
- const note = (s = driver.readState()) => {
- if (seen[seen.length - 1] !== s.currentScreen) {
- seen.push(s.currentScreen);
- log(`screen → ${s.currentScreen}`);
- }
- return s;
- };
-
- note();
- log(`detected framework: ${store.session.integration ?? '(none)'}`);
-
- // Walk the flow. Each branch commits exactly what a user would, via the
- // driver, and reads the resulting screen back.
- for (let i = 0; i < 30; i++) {
- const state = note();
- const screen = state.currentScreen;
-
- if (screen === ScreenId.Intro) {
- driver.performAction('confirm_setup');
- } else if (screen === ScreenId.HealthCheck) {
- // Simulate the readiness probe coming back clean (offline-safe).
- store.setReadinessResult({
- decision: WizardReadiness.Yes,
- health: {} as never,
- reasons: [],
- });
- } else if (screen === ScreenId.Setup) {
- const q = state.setupQuestions[0];
- log(`answering setup "${q.key}" → ${q.options[0].value}`);
- driver.performAction('choose', { key: q.key, value: q.options[0].value });
- } else if (screen === ScreenId.Auth) {
- // Inject credentials (the scoped phx key can't fetch project data; the
- // gateway-bearer path is proven separately). accessToken would be the
- // phx key in a real run.
- store.setCredentials({
- accessToken: 'phx_injected_for_demo',
- projectApiKey: 'phc_demo',
- host: 'https://us.posthog.com',
- projectId: 1,
- });
- } else if (screen === ScreenId.Run) {
- // Simulate the agent run completing (real agent proven via the gateway).
- store.setRunPhase(RunPhase.Running);
- store.setRunPhase(RunPhase.Completed);
- } else if (screen === ScreenId.Outro) {
- driver.performAction('dismiss_outro');
- } else if (screen === ScreenId.Mcp) {
- driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
- } else if (screen === ScreenId.McpSuggestedPrompts) {
- driver.performAction('dismiss');
- } else if (screen === ScreenId.SlackConnect) {
- driver.performAction('dismiss_slack');
- } else if (screen === ScreenId.KeepSkills) {
- driver.performAction('keep_skills', { kept: true });
- break; // terminal: skillsComplete is the run's done-signal
- } else if (screen === Overlay.WizardAsk) {
- // Not used by the integration program, but handle it generically.
- const q = state.pendingQuestion!.questions[0];
- driver.performAction('answer_question', {
- answers: { [q.id]: q.options?.[0]?.value ?? 'ok' },
- });
- } else {
- log(`no driver branch for "${screen}" — stopping`);
- break;
- }
- }
-
- const done = store.session.skillsComplete;
- process.stdout.write(
- `\n${done ? '✓' : '✗'} skillsComplete=${done} path: ${seen.join(
- ' → ',
- )}\n\n`,
- );
- process.exit(done ? 0 : 1);
-}
-
-main().catch((e) => {
- process.stderr.write(`\nDEMO_FAIL: ${e?.stack ?? e}\n`);
- process.exit(1);
-});
diff --git a/scripts/ci-driver-live-agent.ts b/scripts/ci-driver-live-agent.ts
deleted file mode 100644
index c65c2222..00000000
--- a/scripts/ci-driver-live-agent.ts
+++ /dev/null
@@ -1,170 +0,0 @@
-/**
- * Live proof: a REAL gateway LLM drives the wizard-ci-tools MCP server.
- *
- * Configures the PostHog LLM gateway with the phx personal API key as bearer
- * (the same "creative hack" the CI auth path uses — no OAuth, no browser),
- * attaches the in-process wizard-ci-tools server to a real `query()`, and asks
- * the model to read the wizard's state and advance it. Success = the model
- * actually moved the real store off the intro screen by calling perform_action.
- *
- * PHX_KEY_FILE=/path/to/key.txt tsx scripts/ci-driver-live-agent.ts
- */
-
-import fs from 'fs';
-import { query } from '@anthropic-ai/claude-agent-sdk';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession } from '@lib/wizard-session';
-import { buildAgentEnv } from '@lib/agent/agent-interface';
-import { Program } from '@lib/programs/program-registry';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import {
- createWizardCiToolsServer,
- CI_TOOL_NAMES,
-} from '@e2e-harness/wizard-ci-tools';
-
-const GATEWAY_URL = 'https://gateway.us.posthog.com/wizard';
-const MODEL = 'claude-haiku-4-5-20251001';
-
-async function main() {
- const keyFile = process.env.PHX_KEY_FILE;
- if (!keyFile) throw new Error('Set PHX_KEY_FILE to the phx key path');
- const phxKey = fs.readFileSync(keyFile, 'utf8').trim();
-
- // Point the agent SDK at the PostHog gateway, phx key as bearer.
- process.env.ANTHROPIC_BASE_URL = GATEWAY_URL;
- process.env.ANTHROPIC_AUTH_TOKEN = phxKey;
- process.env.CLAUDE_CODE_OAUTH_TOKEN = phxKey;
- process.env.CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS = 'true';
-
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store));
- store.session = buildSession({ installDir: '/tmp/ci-live', ci: true });
- const driver = new WizardCiDriver(store);
- const server = await createWizardCiToolsServer(driver);
-
- process.stdout.write(
- `\nBefore: currentScreen=${
- driver.readState().currentScreen
- } setupConfirmed=${store.session.setupConfirmed}\n\n`,
- );
-
- const prompt =
- 'You are driving a PostHog wizard through its test control plane. ' +
- 'The wizard is on its intro screen. Your very first action must be to call ' +
- 'the perform_action tool with {"action":"confirm_setup"} (no other params) ' +
- 'to advance past it. Do that immediately, before anything else. ' +
- 'Then call read_state once and report the new currentScreen. Be terse.';
-
- const abort = new AbortController();
- const timer = setTimeout(() => abort.abort(), 220_000);
-
- const toolCalls: string[] = [];
- let finalText = '';
-
- // Streaming-input prompt. A plain string prompt closes stdin after turn 1,
- // which breaks every follow-up turn (the wizard hits the same SDK bug and
- // works around it the same way). Keep the generator open until the SDK
- // emits its `result` message so the session survives multi-turn tool use.
- let signalDone!: () => void;
- const resultReceived = new Promise((r) => {
- signalDone = r;
- });
- const promptStream = async function* () {
- yield {
- type: 'user' as const,
- session_id: '',
- message: { role: 'user' as const, content: prompt },
- parent_tool_use_id: null,
- };
- await resultReceived;
- };
-
- try {
- const response = query({
- prompt: promptStream(),
- options: {
- abortController: abort,
- model: MODEL,
- permissionMode: 'bypassPermissions',
- betas: ['context-1m-2025-08-07'],
- systemPrompt: { type: 'preset', preset: 'claude_code' },
- tools: { type: 'preset', preset: 'claude_code' },
- env: {
- ...process.env,
- // The user's Anthropic key (set in this shell) would override the
- // gateway bearer and 401 — unset it so ANTHROPIC_AUTH_TOKEN wins.
- ANTHROPIC_API_KEY: undefined,
- ANTHROPIC_BASE_URL: GATEWAY_URL,
- ANTHROPIC_AUTH_TOKEN: phxKey,
- CLAUDE_CODE_OAUTH_TOKEN: phxKey,
- ENABLE_TOOL_SEARCH: 'auto:0',
- MCP_CONNECTION_NONBLOCKING: '0',
- // The gateway expects PostHog's custom headers (bedrock fallback +
- // metadata) — the wizard sets these for every real run.
- ANTHROPIC_CUSTOM_HEADERS: buildAgentEnv({}, {}),
- },
- mcpServers: { [`wizard-ci-tools`]: server },
- allowedTools: [
- CI_TOOL_NAMES.readState,
- CI_TOOL_NAMES.listActions,
- CI_TOOL_NAMES.performAction,
- ],
- },
- } as never);
-
- for await (const msg of response as AsyncIterable) {
- if (msg.type === 'assistant') {
- for (const block of msg.message?.content ?? []) {
- if (block.type === 'tool_use') {
- toolCalls.push(block.name);
- process.stdout.write(` → tool_use: ${block.name}\n`);
- } else if (block.type === 'text' && block.text) {
- finalText = block.text;
- }
- }
- } else if (msg.type === 'result') {
- if (msg.result) finalText = msg.result;
- signalDone(); // close the prompt stream so the SDK can exit
- }
- // Stop as soon as the model has driven the store off the intro screen —
- // one successful tool-driven commit is the proof we're after.
- if (store.session.setupConfirmed) {
- abort.abort();
- break;
- }
- }
- } catch (e) {
- // A later-turn gateway error must not mask a commit that already landed —
- // we evaluate store state below regardless.
- process.stdout.write(
- ` (query ended: ${e instanceof Error ? e.message.split('\n')[0] : e})\n`,
- );
- } finally {
- signalDone();
- clearTimeout(timer);
- }
-
- const after = driver.readState();
- process.stdout.write(
- `\nAfter: currentScreen=${after.currentScreen} setupConfirmed=${store.session.setupConfirmed}\n`,
- );
- process.stdout.write(`Model said: ${finalText.slice(0, 200)}\n`);
- process.stdout.write(`Tool calls: ${toolCalls.join(', ') || '(none)'}\n\n`);
-
- const advanced = store.session.setupConfirmed === true;
- process.stdout.write(
- `${
- advanced
- ? '✓ LLM advanced the real store via wizard-ci-tools'
- : '✗ store did not advance'
- }\n\n`,
- );
- process.exit(advanced ? 0 : 1);
-}
-
-main().catch((e) => {
- process.stderr.write(`\nLIVE_FAIL: ${e?.stack ?? e}\n`);
- process.exit(1);
-});
diff --git a/scripts/record-demo.no-jest.ts b/scripts/record-demo.no-jest.ts
deleted file mode 100644
index b268c270..00000000
--- a/scripts/record-demo.no-jest.ts
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Produce a sample recording offline (no agent, no network) so you can try the
- * replayer. Walks the integration flow with the e2e profile, injecting the
- * external transitions a real run gets (health probe, auth, agent runPhase) and
- * some agent status/tasks, while a WizardRecorder captures each key moment.
- *
- * tsx scripts/record-demo.no-jest.ts # writes /tmp/wizard-demo.recording.json
- * tsx scripts/replay-e2e.no-jest.ts /tmp/wizard-demo.recording.json --step
- */
-import { writeFileSync } from 'fs';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession, RunPhase } from '@lib/wizard-session';
-import { Integration } from '@lib/constants';
-import { FRAMEWORK_REGISTRY } from '@lib/registry';
-import { WizardReadiness } from '@lib/health-checks/readiness';
-import { Program } from '@lib/programs/program-registry';
-import { ScreenId } from '@ui/tui/router';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import { decideE2eAction } from '@e2e-harness/e2e-profile';
-import { profileFor } from '@e2e-harness/profiles';
-import { WizardRecorder } from '@e2e-harness/recorder';
-
-const out = process.env.RECORDING_OUT ?? '/tmp/wizard-demo.recording.json';
-
-const store = new WizardStore(Program.PostHogIntegration);
-setUI(new InkUI(store));
-const session = buildSession({ installDir: '/tmp/demo-app', ci: true });
-session.integration = Integration.nextjs;
-session.frameworkConfig = FRAMEWORK_REGISTRY[Integration.nextjs];
-store.session = session;
-
-let clock = 0;
-const rec = new WizardRecorder(
- store,
- { program: 'posthog-integration', app: 'demo-nextjs' },
- () => (clock += 600),
-);
-rec.start();
-
-const driver = new WizardCiDriver(store);
-const profile = profileFor(Program.PostHogIntegration);
-
-for (let i = 0; i < 40; i++) {
- const state = driver.readState();
- const d = decideE2eAction(state, profile);
- if (d.action) driver.performAction(d.action.id, d.action.params ?? {});
-
- if (state.currentScreen === ScreenId.HealthCheck) {
- store.setReadinessResult({
- decision: WizardReadiness.Yes,
- health: {} as never,
- reasons: [],
- });
- } else if (state.currentScreen === ScreenId.Auth) {
- store.setCredentials({
- accessToken: 'phx_secret',
- projectApiKey: 'phc_demo',
- host: 'https://us.posthog.com',
- projectId: 1,
- });
- } else if (state.currentScreen === ScreenId.Run) {
- store.pushStatus('Installing posthog-js…');
- store.setTasks([
- { label: 'Install SDK', status: 'completed' as never, done: true },
- ]);
- store.pushStatus('Wiring instrumentation-client.ts…');
- store.setRunPhase(RunPhase.Completed);
- }
- if (d.done || store.session.skillsComplete) break;
-}
-rec.stop();
-writeFileSync(out, JSON.stringify(rec.getRecording(), null, 2));
-process.stdout.write(`recorded ${rec.frameCount} frames → ${out}\n`);
-process.stdout.write(
- `replay: npx tsx scripts/replay-e2e.no-jest.ts ${out} --step\n`,
-);
From 5f214b7c1cb5c06cd014a3c287bca08d3acc9642 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 17:35:23 -0400
Subject: [PATCH 17/38] docs(e2e-harness): add the agent exploration runbook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
EXPLORING-AS-AN-AGENT.md — a runbook for an agent that wants to run/drive/explore
the wizard headlessly: ask the user for a key file path + set env, then either a
full `wizard-ci --e2e` run or a hand-driven read_state→perform_action loop, with
renderFrame to snapshot the TUI for itself to view. Gives wizard-ci-tools its
documented use (agentic exploration). Recipe smoke-tested (intro → health-check,
renders the real screen). ARCHITECTURE.md points at it.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/ARCHITECTURE.md | 5 ++
e2e-harness/EXPLORING-AS-AN-AGENT.md | 130 +++++++++++++++++++++++++++
2 files changed, 135 insertions(+)
create mode 100644 e2e-harness/EXPLORING-AS-AN-AGENT.md
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index 9ca36af1..256a8ea3 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -6,6 +6,11 @@ it runs the WHOLE interactive flow headlessly via `wizard-ci-tools` and asserts
on structured state — not the classic `--ci` mode (LoggingUI, stdout-grep,
agent-only).
+> If you're an agent that just wants to **run and explore the wizard** (drive
+> it, view the screens, snapshot it), start with the runbook:
+> [`EXPLORING-AS-AN-AGENT.md`](EXPLORING-AS-AN-AGENT.md). This doc is the _how
+> it works_ underneath.
+
## The pieces
This whole harness lives in `e2e-harness/` at the repo root — deliberately OUT
diff --git a/e2e-harness/EXPLORING-AS-AN-AGENT.md b/e2e-harness/EXPLORING-AS-AN-AGENT.md
new file mode 100644
index 00000000..8334a58f
--- /dev/null
+++ b/e2e-harness/EXPLORING-AS-AN-AGENT.md
@@ -0,0 +1,130 @@
+# Driving & exploring the wizard as an agent
+
+A runbook for a future AI agent (you) that wants to **run the real wizard
+headlessly, drive its state, and snapshot the TUI to view it** — to explore or
+test the app with no terminal. It uses the control plane in this folder
+(`WizardCiDriver` + the `wizard-ci-tools` MCP server). For _how_ it works under
+the hood, read [`ARCHITECTURE.md`](ARCHITECTURE.md); this is the _how to do it_.
+
+## 0. Ask for the key, then set up
+
+**First, ask the user for the path to their PostHog key file** — e.g. "What's
+the absolute path to your phx key file?" — plus the project id and region if you
+don't have them. Then, in the shell you'll run from:
+
+```bash
+export POSTHOG_PERSONAL_API_KEY="$(cat <path-to-key-file>)" # the phx key
+export POSTHOG_WIZARD_PROJECT_ID=<project-id> # the team the key is scoped to
+export POSTHOG_REGION=us # or eu
+export WIZARD_PATH=<path-to-wizard-repo> # where e2e-harness/ lives
+```
+
+Rules: **never print or commit the key.** Always run against a **`/tmp` copy**
+of an app, never a real fixture. If you're inside a Claude Code session, the
+harness strips the host `CLAUDE_*`/`ANTHROPIC_*` env for the child so the
+spawned agent auths with the phx key (the `apiKeySource: none` → 401 trap).
+
+## 1. Full run, then view it (the easy path)
+
+From [wizard-workbench](https://github.com/PostHog/wizard-workbench):
+
+```bash
+pnpm wizard-ci --e2e # real agent, headless; writes a recording
+pnpm wizard-ci-snapshots # renders each key moment → .ans + report.html
+```
+
+To watch it back:
+`pnpm wizard-ci --replay /tmp/wizard-e2e-<name>.recording.json --step`, or just
+read the `.ans` frames / `report.html`. This is the whole flow, real agent, no
+decisions for you to make.
+
+## 2. Drive it yourself (the control plane)
+
+To step the flow and **decide each screen**, use the three primitives —
+`read_state`, `list_actions`, `perform_action`. They're exposed as the
+`wizard-ci-tools` MCP server (`createWizardCiToolsServer`) for a connected
+driver model; the same primitives are `WizardCiDriver` methods you can call
+directly from a tsx script. The loop is always:
+
+```
+read_state → look at currentScreen + the legal actions → perform_action(one of them) → read_state → …
+```
+
+Recipe — write it to a scratch file **inside this repo** so the `@lib`/`@ui`/
+`@e2e-harness` aliases resolve (a `/tmp` file won't see the tsconfig). Name it
+`scripts/explore.no-jest.ts` (the `.no-jest` suffix keeps Jest from picking it
+up), run `npx tsx scripts/explore.no-jest.ts` from `$WIZARD_PATH`, then delete
+it. It drives the **UI** screens offline (no agent/auth) and renders each one so
+you can see it:
+
+```ts
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession } from '@lib/wizard-session';
+import { Program } from '@lib/programs/program-registry';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
+import { WizardRecorder } from '@e2e-harness/recorder';
+import { renderFrame } from '@e2e-harness/replay';
+
+async function main() {
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store));
+ store.session = buildSession({ installDir: '/tmp/app-copy', ci: true });
+ await store.runReadyHooks(); // real framework detection
+
+ const rec = new WizardRecorder(store, { program: 'posthog-integration' });
+ rec.start();
+ const driver = new WizardCiDriver(store);
+
+ // LOOK: where am I, and what can I commit?
+ console.log(
+ driver.readState().currentScreen,
+ driver.listActions().map((a) => a.id),
+ );
+
+ // ACT: name an action from list_actions (not a keystroke)
+ driver.performAction('confirm_setup');
+ console.log(
+ driver.readState().currentScreen,
+ driver.listActions().map((a) => a.id),
+ );
+ // …repeat: read_state → decide → perform_action…
+
+ // VIEW: render every captured frame as the real TUI (ANSI) so you can see it
+ rec.stop();
+ for (const f of rec.getRecording().frames) {
+ console.log(
+ `\n=== ${f.screen} ===\n` + renderFrame(f, Program.PostHogIntegration),
+ );
+ }
+}
+main();
+```
+
+`auth` and `run` are _external_ steps (the runner sets credentials, the agent
+sets run state) — for those, drive the full `--e2e` path in §1, which runs the
+real agent and records it for you.
+
+## 3. Snapshot for yourself to view
+
+Two ways to "see" a screen as an agent:
+
+- **`renderFrame(frame, program)`** → the real Ink screen as an ANSI string you
+ can print and read (used above). Strip ANSI if you want plain text.
+- **The recording JSON** — each frame already carries `screen`, `tasks`,
+ `statusMessages`, and the (secret-redacted) `session`, so you can assert on
+ what happened without rendering.
+
+The access token is redacted in both `read_state` and recordings, so anything
+you capture is safe to share.
+
+## Key facts
+
+- **State → screen.** You never navigate; you flip a session flag (via an
+ action's store setter) and the router re-derives the active screen. Name
+ actions, not keys.
+- **Secrets stay out.** `read_state` reduces credentials to `hasCredentials` +
+ `projectId`; the token is never serialized.
+- **None of this ships.** The harness lives in `e2e-harness/`, out of `src/`,
+ and is absent from the production bundle.
From 5491e27cba47e7b2764459ff198e5ee54bdd9cd9 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 17:50:24 -0400
Subject: [PATCH 18/38] docs: move agent-exploration to wizard README, trim
comments to current behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- README: add "Explore with an agent" under Running locally → Testing (was wrongly
placed in the workbench README).
- scripts/README: drop the cross-PR pointer to the #703 repro scripts.
- Trim header/inline comments across the harness + scripts to concise descriptions
of what the code does now — no history, no change-rationale.
Co-Authored-By: Claude Opus 4.8
---
README.md | 280 +++++++++++++++++-----------
e2e-harness/ARCHITECTURE.md | 4 +-
e2e-harness/e2e-profile.ts | 13 +-
e2e-harness/profiles.ts | 8 +-
scripts/README.md | 6 -
scripts/render-snapshots.no-jest.ts | 10 +-
6 files changed, 178 insertions(+), 143 deletions(-)
diff --git a/README.md b/README.md
index f48ae2a3..cbb99f3e 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
-
-> have any feedback, please drop an email to **[wizard@posthog.com](mailto:wizard@posthog.com)**.
+> have any feedback, please drop an email to
+> **[wizard@posthog.com](mailto:wizard@posthog.com)**.
PostHog wizard ✨
@@ -19,22 +19,36 @@ To use the wizard, you can run it directly using:
npx @posthog/wizard
```
-Currently the wizard can be used for over 16+ frameworks for frontend, backend, and mobile applications. If you have other integrations you would like the wizard to
-support, please open a [GitHub issue](https://github.com/posthog/wizard/issues)!
+Currently the wizard supports 16+ frameworks for frontend, backend,
+and mobile applications. If you have other integrations you would like the
+wizard to support, please open a
+[GitHub issue](https://github.com/posthog/wizard/issues)!
-Visit our [docs](https://posthog.com/docs/ai-engineering/ai-wizard) to learn more.
+Visit our [docs](https://posthog.com/docs/ai-engineering/ai-wizard) to learn
+more.
## Privacy & data usage
-The wizard uses **Anthropic Claude** (via PostHog's LLM gateway) to read your project's source files and integrate PostHog. A few things worth knowing up front:
+The wizard uses **Anthropic Claude** (via PostHog's LLM gateway) to read your
+project's source files and integrate PostHog. A few things worth knowing up
+front:
- **Source files** are sent to Anthropic as part of the agent's context.
-- **`.env*` files and secrets** stay on your machine. The wizard's security scanner blocks anything it identifies as a secret from being read by the agent.
-- **Telemetry** (run metadata — phase, task list, planned events) is sent to PostHog by default. Pass `--no-telemetry` (or set `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
-- **AI opt-in**: the wizard honors your PostHog organization's `is_ai_data_processing_approved` setting (the same toggle that gates Max). If your org has not opted in, the wizard explains how to enable it and exits without sending source to Anthropic.
-- **Prefer your own AI?** The wizard's integration knowledge ships as a context-mill skill you can download and run inside your own agent.
-
-The wizard's "Privacy & data usage" menu (intro screen) and the `[I]` shortcut on the auth screen surface the same information in-terminal.
+- **`.env*` files and secrets** stay on your machine. The wizard's security
+ scanner blocks anything it identifies as a secret from being read by the
+ agent.
+- **Telemetry** (run metadata — phase, task list, planned events) is sent to
+ PostHog by default. Pass `--no-telemetry` (or set
+ `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
+- **AI opt-in**: the wizard honors your PostHog organization's
+ `is_ai_data_processing_approved` setting (the same toggle that gates Max). If
+ your org has not opted in, the wizard explains how to enable it and exits
+ without sending source to Anthropic.
+- **Prefer your own AI?** The wizard's integration knowledge ships as a
+ context-mill skill you can download and run inside your own agent.
+
+The wizard's "Privacy & data usage" menu (intro screen) and the `[I]` shortcut
+on the auth screen surface the same information in-terminal.
## MCP Commands
@@ -52,8 +66,8 @@ npx @posthog/wizard mcp remove
## Audit
Audit an existing PostHog integration for correctness and best practices. The
-`audit` command is a **family**. With no subcommand it runs the **events**
-audit (the default); pass a subcommand to run a specific one:
+`audit` command is a **family**. With no subcommand it runs the **events** audit
+(the default); pass a subcommand to run a specific one:
```bash
# Runs the events audit (the default) — no subcommand needed
@@ -72,12 +86,12 @@ npx @posthog/wizard audit web-analytics # web analytics setup
Most audit subcommands resolve at runtime from the published skill registry, so
new audits appear without a wizard release (`web-analytics` is wizard-native).
-> **`audit ` chooses an audit area — it does not take a skill name.**
-> The audit subcommands above *are* context-mill skills promoted to commands (via
-> a `cli: role: command` block); [`wizard skill `](#run-a-single-skill)
-> runs a skill that hasn't been promoted. Same machinery, two surfaces.
-> (`wizard audit --help` still labels the positional `[skill]` — read it as "pick
-> a subcommand.")
+> **`audit <subcommand>` chooses an audit area — it does not take a skill
+> name.** The audit subcommands above _are_ context-mill skills promoted to
+> commands (via a `cli: role: command` block);
+> [`wizard skill <name>`](#run-a-single-skill) runs a skill that hasn't
+> been promoted. Same machinery, two surfaces. (`wizard audit --help` still
+> labels the positional `[skill]` — read it as "pick a subcommand.")
## Run a single skill
@@ -115,13 +129,13 @@ OAuth sources open the PostHog app's new-source flow in your browser.
## Headless signup + install (agents / CI)
-> ⚠️ `--ci` is **not currently supported in published builds** (see [CI Mode](#ci-mode)).
-> This flow works in development builds only.
+> ⚠️ `--ci` is **not currently supported in published builds** (see
+> [CI Mode](#ci-mode)). This flow works in development builds only.
-For a fully non-interactive first-run (no existing PostHog account, no TTY,
-no browser), combine `--ci --signup --email`. The wizard provisions a new
-account, uses the returned personal API key to run the normal CI install,
-and wires PostHog into the project at `--install-dir`:
+For a fully non-interactive first-run (no existing PostHog account, no TTY, no
+browser), combine `--ci --signup --email`. The wizard provisions a new account,
+uses the returned personal API key to run the normal CI install, and wires
+PostHog into the project at `--install-dir`:
```bash
npx @posthog/wizard --ci --signup \
@@ -147,38 +161,37 @@ npx @posthog/wizard provision --email user@example.com --region eu --json
```
Success prints the full `ProvisioningResult` (`projectApiKey`, `host`,
-`projectId`, `accountId`, `accessToken`, `refreshToken`, and
-`personalApiKey` if present). Failure exits 1; in `--json` mode the error
-is emitted to stderr as `{"error":"...","code":"..."}`, with `code` set to
-`email_exists` when the address is already registered.
+`projectId`, `accountId`, `accessToken`, `refreshToken`, and `personalApiKey` if
+present). Failure exits 1; in `--json` mode the error is emitted to stderr as
+`{"error":"...","code":"..."}`, with `code` set to `email_exists` when the
+address is already registered.
-> ⚠️ **Output contains live credentials.** Pipe it into a secrets store —
-> do not let it be captured by shared CI logs. Mask the step output or
-> redirect stdout to a file your job reads and discards.
+> ⚠️ **Output contains live credentials.** Pipe it into a secrets store — do not
+> let it be captured by shared CI logs. Mask the step output or redirect stdout
+> to a file your job reads and discards.
# Options
The following CLI arguments are available:
-| Option | Description | Type | Default | Choices | Environment Variable |
-| ----------------- | ---------------------------------------------------------------- | ------- | ------- | ---------------------------------------------------- | ------------------------------ |
-| `--help` | Show help | boolean | | | |
-| `--version` | Show version number | boolean | | | |
-| `--debug` | Enable verbose logging | boolean | `false` | | `POSTHOG_WIZARD_DEBUG` |
-| `--signup` | Create a new PostHog account during setup | boolean | `false` | | `POSTHOG_WIZARD_SIGNUP` |
-| `--install-dir` | Directory to install PostHog in | string | | | `POSTHOG_WIZARD_INSTALL_DIR` |
-| `--ci` | Enable CI mode for non-interactive execution | boolean | `false` | | `POSTHOG_WIZARD_CI` |
-| `--api-key` | PostHog personal API key (phx_xxx) for authentication | string | | | `POSTHOG_WIZARD_API_KEY` |
-| `--no-telemetry` | Disable wizard run-state telemetry | boolean | `false` | | `POSTHOG_WIZARD_NO_TELEMETRY` |
-
+| Option | Description | Type | Default | Choices | Environment Variable |
+| ---------------- | ----------------------------------------------------- | ------- | ------- | ------- | ----------------------------- |
+| `--help` | Show help | boolean | | | |
+| `--version` | Show version number | boolean | | | |
+| `--debug` | Enable verbose logging | boolean | `false` | | `POSTHOG_WIZARD_DEBUG` |
+| `--signup` | Create a new PostHog account during setup | boolean | `false` | | `POSTHOG_WIZARD_SIGNUP` |
+| `--install-dir` | Directory to install PostHog in | string | | | `POSTHOG_WIZARD_INSTALL_DIR` |
+| `--ci` | Enable CI mode for non-interactive execution | boolean | `false` | | `POSTHOG_WIZARD_CI` |
+| `--api-key` | PostHog personal API key (phx_xxx) for authentication | string | | | `POSTHOG_WIZARD_API_KEY` |
+| `--no-telemetry` | Disable wizard run-state telemetry | boolean | `false` | | `POSTHOG_WIZARD_NO_TELEMETRY` |
# CI Mode
> ⚠️ **CI mode is not currently supported in published builds.** PostHog's LLM
-> gateway doesn't yet grant the scopes the wizard needs to personal API keys
-> for most users, so non-interactive `--ci` runs fail at the gateway. The flag
-> is disabled in the published package and exits with an error — run the wizard
-> in an interactive terminal instead (`npx @posthog/wizard`). The notes below
+> gateway doesn't yet grant the scopes the wizard needs to personal API keys for
+> most users, so non-interactive `--ci` runs fail at the gateway. The flag is
+> disabled in the published package and exits with an error — run the wizard in
+> an interactive terminal instead (`npx @posthog/wizard`). The notes below
> describe CI mode as it works in development builds.
Run the wizard non-interactive executions with `--ci`:
@@ -199,8 +212,10 @@ The CLI args override environment variables in CI mode.
### Required Flags for CI Mode
-- `--api-key`: Personal API key (`phx_xxx`) from your [PostHog settings](https://app.posthog.com/settings/user-api-keys)
-- `--install-dir`: Directory to install PostHog in (e.g., `.` for current directory)
+- `--api-key`: Personal API key (`phx_xxx`) from your
+ [PostHog settings](https://app.posthog.com/settings/user-api-keys)
+- `--install-dir`: Directory to install PostHog in (e.g., `.` for current
+ directory)
### Required API Key Scopes
@@ -214,8 +229,8 @@ When creating your personal API key, ensure it has the following scopes enabled:
### OAuth app scope ceiling
-The wizard's OAuth app on the PostHog side caps the scopes its tokens may
-carry (`OAuthApplication.scopes`). Any scope requested in this repo (see
+The wizard's OAuth app on the PostHog side caps the scopes its tokens may carry
+(`OAuthApplication.scopes`). Any scope requested in this repo (see
`src/lib/oauth/program-scopes.ts`) must be present in that list. Current
ceiling, for bookkeeping:
@@ -228,18 +243,18 @@ user:read,project:read,llm_gateway:read,dashboard:read,dashboard:write,insight:r
The CLI was overhauled to consolidate commands into a smaller, extensible
surface. If you used an older command, here's where it went:
-| Old command | New command | What changed |
-|---|---|---|
-| `wizard integrate` | `wizard` (default flow) | Command removed; the default flow runs the integration |
-| `wizard events-audit` | `wizard audit events` | Now an `audit`-family subcommand |
-| `wizard audit` (single audit) | `wizard audit ` | Now a family; see [Audit](#audit) for the subcommands |
-| `wizard audit-3000` | *removed* | Retired |
-| `wizard revenue` | `wizard revenue-analytics` | Renamed (old `revenue` removed) |
-| `wizard upload-sourcemaps` | `wizard upload-source-maps` | Renamed; `upload-sourcemaps` still works as an alias |
-
-> **Commands vs. programs:** `integrate` was the *command*; the program behind it
-> is `posthog-integration`, which still exists and now powers the default flow.
-> Other commands depend on it via `requires: ['posthog-integration']`. The
+| Old command | New command | What changed |
+| ----------------------------- | --------------------------- | ------------------------------------------------------ |
+| `wizard integrate` | `wizard` (default flow) | Command removed; the default flow runs the integration |
+| `wizard events-audit` | `wizard audit events` | Now an `audit`-family subcommand |
+| `wizard audit` (single audit) | `wizard audit <subcommand>` | Now a family; see [Audit](#audit) for the subcommands |
+| `wizard audit-3000` | _removed_ | Retired |
+| `wizard revenue` | `wizard revenue-analytics` | Renamed (old `revenue` removed) |
+| `wizard upload-sourcemaps` | `wizard upload-source-maps` | Renamed; `upload-sourcemaps` still works as an alias |
+
+> **Commands vs. programs:** `integrate` was the _command_; the program behind
+> it is `posthog-integration`, which still exists and now powers the default
+> flow. Other commands depend on it via `requires: ['posthog-integration']`. The
> program id is internal — it was never a command you typed.
# Steal this code
@@ -275,8 +290,8 @@ When the user authenticates, the wizard also streams live run state — current
phase, task list, planned events — to `POST /api/projects/{id}/wizard/sessions/`
so the PostHog web app can render real-time progress. Updates are debounced
(250ms) with phase changes flushed immediately; failures fall back silently to
-the wizard's debug log without disturbing the TUI. Pass `--no-telemetry` (or
-set `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
+the wizard's debug log without disturbing the TUI. Pass `--no-telemetry` (or set
+`POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
## Leave rules behind
@@ -311,10 +326,9 @@ users of the wizard, no training delays or other ambiguity.
## Keep secrets out of the LLM
-The wizard somtimes needs to move a secret. The agent
-orchestrates that journey, but the raw value should _never_ enter the LLM
-conversation, where it would be sent to the model provider, written to
-transcripts, and captured in logs.
+The wizard sometimes needs to move a secret. The agent orchestrates that journey,
+but the raw value should _never_ enter the LLM conversation, where it would be
+sent to the model provider, written to transcripts, and captured in logs.
`src/lib/secret-vault.ts` is a small, reusable pattern for exactly this. It's a
session-scoped, in-memory vault: a tool that handles a secret calls `put()` to
@@ -337,42 +351,50 @@ drive the work end to end, but the only thing it ever sees is an opaque handle.
## Build system
-Built with [tsdown](https://tsdown.dev/) (Rolldown). `pnpm build` bundles `bin.ts` into ESM chunks in `dist/`, inlining all local source and keeping npm dependencies external.
+Built with [tsdown](https://tsdown.dev/) (Rolldown). `pnpm build` bundles
+`bin.ts` into ESM chunks in `dist/`, inlining all local source and keeping npm
+dependencies external.
### Environment variables
-**Build-time (locked).** `NODE_ENV` is replaced with `"production"` at compile time. It cannot be overridden at runtime. All URLs, OAuth client IDs, and dev-mode code paths resolve to their production values unconditionally.
+**Build-time (locked).** `NODE_ENV` is replaced with `"production"` at compile
+time. It cannot be overridden at runtime. All URLs, OAuth client IDs, and
+dev-mode code paths resolve to their production values unconditionally.
-To add a new build-time constant, add it to `env` in `tsdown.config.ts` and export it from `src/env.ts`.
+To add a new build-time constant, add it to `env` in `tsdown.config.ts` and
+export it from `src/env.ts`.
-**Runtime (allowlisted).** Runtime env reads go through `runtimeEnv()` in `src/env.ts`, which only accepts keys in the `RuntimeEnvKey` union:
+**Runtime (allowlisted).** Runtime env reads go through `runtimeEnv()` in
+`src/env.ts`, which only accepts keys in the `RuntimeEnvKey` union:
-| Variable | Purpose |
-|---|---|
-| `POSTHOG_WIZARD_BENCHMARK_CONFIG` | Path to benchmark config file |
-| `POSTHOG_WIZARD_BENCHMARK_FILE` | Output path for benchmark results |
-| `POSTHOG_WIZARD_LOG_DIR` | Log directory override |
-| `POSTHOG_WIZARD_DEBUG` / `DEBUG` | Enable debug output |
-| `MCP_URL` | Override MCP server URL |
-| `POSTHOG_API_KEY` | API key for MCP subprocess auth |
-| `TERM`, `TERM_PROGRAM`, `CI`, etc. | Terminal/platform detection |
-| `APPDATA`, `XDG_CONFIG_HOME` | Platform path resolution |
+| Variable | Purpose |
+| ---------------------------------- | --------------------------------- |
+| `POSTHOG_WIZARD_BENCHMARK_CONFIG` | Path to benchmark config file |
+| `POSTHOG_WIZARD_BENCHMARK_FILE` | Output path for benchmark results |
+| `POSTHOG_WIZARD_LOG_DIR` | Log directory override |
+| `POSTHOG_WIZARD_DEBUG` / `DEBUG` | Enable debug output |
+| `MCP_URL` | Override MCP server URL |
+| `POSTHOG_API_KEY` | API key for MCP subprocess auth |
+| `TERM`, `TERM_PROGRAM`, `CI`, etc. | Terminal/platform detection |
+| `APPDATA`, `XDG_CONFIG_HOME` | Platform path resolution |
To add a new runtime env var, add its key to `RuntimeEnvKey` in `src/env.ts`.
-**Direct `process.env` access** is only used for subprocess environment writes (e.g. `agent-interface.ts` setting `ANTHROPIC_BASE_URL`), vendored code, and tests.
+**Direct `process.env` access** is only used for subprocess environment writes
+(e.g. `agent-interface.ts` setting `ANTHROPIC_BASE_URL`), vendored code, and
+tests.
### Import aliases
Path aliases defined in `tsconfig.build.json`, resolved by tsdown:
-| Alias | Maps to |
-|---|---|
-| `@env` | `src/env.ts` |
-| `@lib/*` | `src/lib/*` |
-| `@utils/*` | `src/utils/*` |
-| `@ui/*` | `src/ui/*` |
-| `@steps/*` | `src/steps/*` |
+| Alias | Maps to |
+| --------------- | ------------------ |
+| `@env` | `src/env.ts` |
+| `@lib/*` | `src/lib/*` |
+| `@utils/*` | `src/utils/*` |
+| `@ui/*` | `src/ui/*` |
+| `@steps/*` | `src/steps/*` |
| `@frameworks/*` | `src/frameworks/*` |
## Running locally
@@ -389,7 +411,8 @@ pnpm try --install-dir=[a path]
pnpm run dev
```
-This builds, links globally, and watches for changes. Leave it running - any `.ts` file changes will auto-rebuild. Then from any project:
+This builds, links globally, and watches for changes. Leave it running - any
+`.ts` file changes will auto-rebuild. Then from any project:
```bash
wizard --integration=nextjs
@@ -398,7 +421,7 @@ wizard --integration=nextjs
wizard --integration=nextjs --local-mcp
```
-## Testing
+### Testing
To run unit tests, run:
@@ -415,6 +438,31 @@ bin/test-e2e
E2E tests are a bit more complicated to create and adjust due to to their mocked
LLM calls. See the `e2e-tests/README.md` for more information.
+#### Explore with an agent
+
+You can hand the wizard to an AI agent and have it **run, drive, and explore the
+wizard itself** — against any app, headlessly, snapshotting the TUI so it can
+see what happened. The runbook is
+[`e2e-harness/EXPLORING-AS-AN-AGENT.md`](e2e-harness/EXPLORING-AS-AN-AGENT.md):
+it covers driving the flow through the `wizard-ci-tools` control plane
+(`read_state` / `list_actions` / `perform_action`), capturing snapshots with
+`renderFrame`, and the env a run needs.
+
+Point an agent at it with a prompt like the one below (here, exploring against
+[open-saas](https://github.com/wasp-lang/open-saas)):
+
+> Explore the PostHog wizard against a real app. Read
+> `e2e-harness/EXPLORING-AS-AN-AGENT.md` — your runbook for driving the wizard
+> headlessly, capturing snapshots, and the env you'll need. Ask me for my phx
+> key file path and set up per the runbook. Then clone
+> `https://github.com/wasp-lang/open-saas` into a throwaway `/tmp` copy, work
+> out how to build it, and run the wizard against it — driving the flow,
+> snapshotting each key moment, and rendering the screens back so I can see
+> them. Then tell me what the wizard did: which screens it walked, what it
+> changed, and anything that broke.
+
+The agent works out how to build and run the target itself — that's the point.
+
## Publishing your tool
To make your version of a tool usable with a one-line `npx` command:
@@ -426,26 +474,26 @@ To make your version of a tool usable with a one-line `npx` command:
# Health checks
-`src/lib/health-checks/` checks external status pages and PostHog-owned
-services before the wizard runs to decide whether it can proceed. The entry
-point is `evaluateWizardReadiness()`, which returns one of three values:
+`src/lib/health-checks/` checks external status pages and PostHog-owned services
+before the wizard runs to decide whether it can proceed. The entry point is
+`evaluateWizardReadiness()`, which returns one of three values:
-| Decision | Meaning |
-| ------------------- | --------------------------------------------------------------- |
-| `yes` | All services healthy — proceed normally. |
-| `yes_with_warnings` | Some services degraded but no critical dependency is down. |
-| `no` | A critical dependency is down or degraded — do not run. |
+| Decision | Meaning |
+| ------------------- | ---------------------------------------------------------- |
+| `yes` | All services healthy — proceed normally. |
+| `yes_with_warnings` | Some services degraded but no critical dependency is down. |
+| `no` | A critical dependency is down or degraded — do not run. |
### Module layout
-| File | Responsibility |
-| --- | --- |
-| `types.ts` | Enums, interfaces (`ServiceHealthStatus`, `AllServicesHealth`, etc.) |
+| File | Responsibility |
+| --------------- | ------------------------------------------------------------------------------------- |
+| `types.ts` | Enums, interfaces (`ServiceHealthStatus`, `AllServicesHealth`, etc.) |
| `statuspage.ts` | Statuspage.io v2 API helpers + checks for Anthropic, PostHog, GitHub, npm, Cloudflare |
-| `endpoints.ts` | Direct endpoint checks for LLM Gateway (`/_liveness`) and MCP (`/`) |
-| `readiness.ts` | `checkAllExternalServices`, `evaluateWizardReadiness`, readiness config |
-| `index.ts` | Barrel re-export |
-| `testme.md` | Test running instructions and endpoint reference |
+| `endpoints.ts` | Direct endpoint checks for LLM Gateway (`/_liveness`) and MCP (`/`) |
+| `readiness.ts` | `checkAllExternalServices`, `evaluateWizardReadiness`, readiness config |
+| `index.ts` | Barrel re-export |
+| `testme.md` | Test running instructions and endpoint reference |
## What blocks a run
@@ -466,14 +514,19 @@ degradedBlocksRun: ['anthropic'],
## Smoke test helper (`scripts/smoke-test-ci.sh`)
-This repo includes a helper script to run a full end‑to‑end smoke test of the wizard packaged in a tarball against a real app from [`posthog/wizard-workbench`](https://github.com/PostHog/wizard-workbench). This will catch certain packaging issues that might not be caught by other tests.
+This repo includes a helper script to run a full end‑to‑end smoke test of the
+wizard packaged in a tarball against a real app from
+[`posthog/wizard-workbench`](https://github.com/PostHog/wizard-workbench). This
+will catch certain packaging issues that might not be caught by other tests.
**Prerequisites**
- Point to a `wizard-workbench` checkout either by:
- Setting `WIZARD_WORKBENCH_ROOT=/absolute/path/to/wizard-workbench`, or
- - Cloning `wizard-workbench` next to this repo (so it lives at `../wizard-workbench`).
-- Set `POSTHOG_PERSONAL_API_KEY` either in your shell or in `../wizard-workbench/.env`.
+ - Cloning `wizard-workbench` next to this repo (so it lives at
+ `../wizard-workbench`).
+- Set `POSTHOG_PERSONAL_API_KEY` either in your shell or in
+ `../wizard-workbench/.env`.
- (Optional) Set `POSTHOG_PROJECT_ID` to target a specific PostHog project.
**Usage**
@@ -501,4 +554,5 @@ The script will:
- Copy the selected app into a temp directory
- Install dependencies for the app
- Install the packed wizard tarball into an isolated temp project
-- Run `wizard` in `--ci` mode against the copied app and perform basic post‑install checks
+- Run `wizard` in `--ci` mode against the copied app and perform basic
+ post‑install checks
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index 256a8ea3..16f0b984 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -91,9 +91,7 @@ program e2e-drivable, add its profile to `profiles.ts`.
The flow is **snapshot-tested** offline (no agent, deterministic):
`__tests__/e2e-flow-snapshot.test.ts` golden-checks the (screen → decision)
-trace. Update with `jest -u` after an intentional flow/profile change. This is
-the structured-state analog of the TUI ANSI screenshots in
-`scripts/__screenshots__/`.
+trace. Update with `jest -u` after an intentional flow/profile change.
## Record & replay
diff --git a/e2e-harness/e2e-profile.ts b/e2e-harness/e2e-profile.ts
index f1f3a7aa..dda16295 100644
--- a/e2e-harness/e2e-profile.ts
+++ b/e2e-harness/e2e-profile.ts
@@ -1,13 +1,10 @@
/**
- * WizardE2eProfile — a program's declarative e2e "test definition": the
- * UI choices a headless e2e run should make at each decision point.
+ * WizardE2eProfile — a program's declarative e2e "test definition": the UI
+ * choices a headless e2e run makes at each decision point.
*
- * The *choices* are product knowledge about a program's flow, but they live in
- * the harness ({@link ./profiles}, keyed by program id) rather than on the
- * program config — so none of this e2e machinery reaches the wizard's
- * production source. The harness is generic: it reads a profile and asks
- * {@link decideE2eAction} what to commit on the current screen. Add a program's
- * profile to {@link ./profiles} to make it e2e-drivable.
+ * Per-program choices live in {@link ./profiles}, keyed by program id.
+ * {@link decideE2eAction} maps the current screen + a profile to the commit to
+ * make. Add a program's profile to {@link ./profiles} to make it e2e-drivable.
*/
import { ScreenId, Overlay, type ScreenName } from '@ui/tui/router';
diff --git a/e2e-harness/profiles.ts b/e2e-harness/profiles.ts
index 9cc8e301..fd094ccf 100644
--- a/e2e-harness/profiles.ts
+++ b/e2e-harness/profiles.ts
@@ -2,12 +2,10 @@
* Per-program e2e profiles — the UI choices a headless run makes driving each
* program's flow.
*
- * Each program declares its test path as a readable JSON next to the program
+ * Each program declares its test path as JSON next to it
* (`src/lib/programs//test/e2e.json`): a `profile` (the options the run
- * auto-takes) plus a documented `path`. We load the `profile` here and map it by
- * program id. Those JSONs are *data*, imported only by this harness — never by
- * prod code — so they don't reach the wizard's production source or its bundle.
- * Look one up with {@link profileFor}.
+ * auto-takes) plus a documented `path`. {@link profileFor} loads the `profile`
+ * and maps it by program id.
*/
import { Program, type ProgramId } from '@lib/programs/program-registry';
diff --git a/scripts/README.md b/scripts/README.md
index 619a8343..a3b14dad 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -26,9 +26,3 @@ none of it ships in prod. `WizardCiDriver` (read/act over the store), the
screen→action registry, the `wizard-ci-tools` MCP server, the e2e profiles, and
the recorder/replay. See [`ARCHITECTURE.md`](../e2e-harness/ARCHITECTURE.md) for
how an agent drives these (env strip, scoped project id, gotchas).
-
-> **Security-leak repro scripts** (`relay-prod.no-jest.ts`,
-> `precedence.no-jest.ts`) that reproduce the `ANTHROPIC_BASE_URL`
-> settings-override gateway leak live on the fix PR
-> ([PostHog/wizard#703](https://github.com/PostHog/wizard/pull/703)), documented
-> in its description + comments.
diff --git a/scripts/render-snapshots.no-jest.ts b/scripts/render-snapshots.no-jest.ts
index 6706cc8d..3e0389c0 100644
--- a/scripts/render-snapshots.no-jest.ts
+++ b/scripts/render-snapshots.no-jest.ts
@@ -1,13 +1,7 @@
/**
* Render a recording to per-frame TUI snapshots — one `.ans` file per key
- * moment, the REAL Ink screen rendered to ANSI (via replay's renderFrame, which
- * needs real ink, hence tsx not jest).
- *
- * These are the snapshots the workbench's visual-comparison flow diffs against a
- * committed baseline. A recording comes from a real `--e2e` run, so the
- * snapshots are what the user actually saw; run-to-run differences (e.g. the
- * agent enqueuing a different task) show up in the side-by-side for a human to
- * review.
+ * moment, the real Ink screen rendered to ANSI via replay's renderFrame (needs
+ * real ink, hence tsx not jest). Feeds the workbench visual-comparison flow.
*
* tsx scripts/render-snapshots.no-jest.ts
*/
From a919c3d4e2963ade255dd6556deb752c906a35a0 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 17:54:42 -0400
Subject: [PATCH 19/38] feat(skills): promote the agent-exploration runbook to
a skill
Move e2e-harness/EXPLORING-AS-AN-AGENT.md into .claude/skills/exploring-the-wizard/
so an agent auto-discovers it. Repoint the README + ARCHITECTURE links and list it
in AGENTS.md. ARCHITECTURE.md stays co-located as the how-it-works reference.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/EXPLORING-AS-AN-AGENT.md | 130 ---------------------------
1 file changed, 130 deletions(-)
delete mode 100644 e2e-harness/EXPLORING-AS-AN-AGENT.md
diff --git a/e2e-harness/EXPLORING-AS-AN-AGENT.md b/e2e-harness/EXPLORING-AS-AN-AGENT.md
deleted file mode 100644
index 8334a58f..00000000
--- a/e2e-harness/EXPLORING-AS-AN-AGENT.md
+++ /dev/null
@@ -1,130 +0,0 @@
-# Driving & exploring the wizard as an agent
-
-A runbook for a future AI agent (you) that wants to **run the real wizard
-headlessly, drive its state, and snapshot the TUI to view it** — to explore or
-test the app with no terminal. It uses the control plane in this folder
-(`WizardCiDriver` + the `wizard-ci-tools` MCP server). For _how_ it works under
-the hood, read [`ARCHITECTURE.md`](ARCHITECTURE.md); this is the _how to do it_.
-
-## 0. Ask for the key, then set up
-
-**First, ask the user for the path to their PostHog key file** — e.g. "What's
-the absolute path to your phx key file?" — plus the project id and region if you
-don't have them. Then, in the shell you'll run from:
-
-```bash
-export POSTHOG_PERSONAL_API_KEY="$(cat )" # the phx key
-export POSTHOG_WIZARD_PROJECT_ID= # the team the key is scoped to
-export POSTHOG_REGION=us # or eu
-export WIZARD_PATH= # where e2e-harness/ lives
-```
-
-Rules: **never print or commit the key.** Always run against a **`/tmp` copy**
-of an app, never a real fixture. If you're inside a Claude Code session, the
-harness strips the host `CLAUDE_*`/`ANTHROPIC_*` env for the child so the
-spawned agent auths with the phx key (the `apiKeySource: none` → 401 trap).
-
-## 1. Full run, then view it (the easy path)
-
-From [wizard-workbench](https://github.com/PostHog/wizard-workbench):
-
-```bash
-pnpm wizard-ci --e2e # real agent, headless; writes a recording
-pnpm wizard-ci-snapshots # renders each key moment → .ans + report.html
-```
-
-To watch it back:
-`pnpm wizard-ci --replay /tmp/wizard-e2e-.recording.json --step`, or just
-read the `.ans` frames / `report.html`. This is the whole flow, real agent, no
-decisions for you to make.
-
-## 2. Drive it yourself (the control plane)
-
-To step the flow and **decide each screen**, use the three primitives —
-`read_state`, `list_actions`, `perform_action`. They're exposed as the
-`wizard-ci-tools` MCP server (`createWizardCiToolsServer`) for a connected
-driver model; the same primitives are `WizardCiDriver` methods you can call
-directly from a tsx script. The loop is always:
-
-```
-read_state → look at currentScreen + the legal actions → perform_action(one of them) → read_state → …
-```
-
-Recipe — write it to a scratch file **inside this repo** so the `@lib`/`@ui`/
-`@e2e-harness` aliases resolve (a `/tmp` file won't see the tsconfig). Name it
-`scripts/explore.no-jest.ts` (the `.no-jest` suffix keeps Jest from picking it
-up), run `npx tsx scripts/explore.no-jest.ts` from `$WIZARD_PATH`, then delete
-it. It drives the **UI** screens offline (no agent/auth) and renders each one so
-you can see it:
-
-```ts
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession } from '@lib/wizard-session';
-import { Program } from '@lib/programs/program-registry';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import { WizardRecorder } from '@e2e-harness/recorder';
-import { renderFrame } from '@e2e-harness/replay';
-
-async function main() {
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store));
- store.session = buildSession({ installDir: '/tmp/app-copy', ci: true });
- await store.runReadyHooks(); // real framework detection
-
- const rec = new WizardRecorder(store, { program: 'posthog-integration' });
- rec.start();
- const driver = new WizardCiDriver(store);
-
- // LOOK: where am I, and what can I commit?
- console.log(
- driver.readState().currentScreen,
- driver.listActions().map((a) => a.id),
- );
-
- // ACT: name an action from list_actions (not a keystroke)
- driver.performAction('confirm_setup');
- console.log(
- driver.readState().currentScreen,
- driver.listActions().map((a) => a.id),
- );
- // …repeat: read_state → decide → perform_action…
-
- // VIEW: render every captured frame as the real TUI (ANSI) so you can see it
- rec.stop();
- for (const f of rec.getRecording().frames) {
- console.log(
- `\n=== ${f.screen} ===\n` + renderFrame(f, Program.PostHogIntegration),
- );
- }
-}
-main();
-```
-
-`auth` and `run` are _external_ steps (the runner sets credentials, the agent
-sets run state) — for those, drive the full `--e2e` path in §1, which runs the
-real agent and records it for you.
-
-## 3. Snapshot for yourself to view
-
-Two ways to "see" a screen as an agent:
-
-- **`renderFrame(frame, program)`** → the real Ink screen as an ANSI string you
- can print and read (used above). Strip ANSI if you want plain text.
-- **The recording JSON** — each frame already carries `screen`, `tasks`,
- `statusMessages`, and the (secret-redacted) `session`, so you can assert on
- what happened without rendering.
-
-The access token is redacted in both `read_state` and recordings, so anything
-you capture is safe to share.
-
-## Key facts
-
-- **State → screen.** You never navigate; you flip a session flag (via an
- action's store setter) and the router re-derives the active screen. Name
- actions, not keys.
-- **Secrets stay out.** `read_state` reduces credentials to `hasCredentials` +
- `projectId`; the token is never serialized.
-- **None of this ships.** The harness lives in `e2e-harness/`, out of `src/`,
- and is absent from the production bundle.
From 9d49433f9230a6a4705a43206cfeac65de4383d9 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:23:52 -0400
Subject: [PATCH 20/38] feat(e2e-harness): live MCP server so an agent drives
the wizard turn-by-turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
scripts/wizard-ci-mcp.no-jest.ts is a stdio MCP server over one live WizardStore:
read_state / list_actions / perform_action / render_screen / run_agent. An agent
registers it and makes every decision live, instead of the static scripted run.
Rewrite the exploring-the-wizard skill to lead with this. Bump zod ^3.24→^3.25
(the MCP SDK needs the zod/v3 subpath; non-breaking) and add the SDK as a
devDependency.
Co-Authored-By: Claude Opus 4.8
---
.claude/skills/exploring-the-wizard/SKILL.md | 101 +++++++++++
package.json | 5 +-
pnpm-lock.yaml | 69 +++----
scripts/README.md | 11 +-
scripts/wizard-ci-mcp.no-jest.ts | 180 +++++++++++++++++++
5 files changed, 326 insertions(+), 40 deletions(-)
create mode 100644 .claude/skills/exploring-the-wizard/SKILL.md
create mode 100644 scripts/wizard-ci-mcp.no-jest.ts
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
new file mode 100644
index 00000000..cc109520
--- /dev/null
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -0,0 +1,101 @@
+---
+name: exploring-the-wizard
+description:
+ Run, drive, and explore the PostHog wizard headlessly against an app —
+ manipulate its state turn-by-turn over MCP (read_state / perform_action /
+ run_agent), capturing TUI snapshots to view. Use when you want to test or
+ explore the wizard end-to-end without a terminal.
+compatibility: Designed for Claude Code working on the PostHog wizard codebase.
+metadata:
+ author: posthog
+ version: '2.0'
+---
+
+# Exploring the wizard as an agent
+
+Drive a real wizard run headlessly and **manipulate its state as it happens** —
+read the current screen, make the user's decision, fire the agent, snapshot the
+TUI — all over MCP. The control plane lives in `e2e-harness/`; for _how_ it
+works underneath, read
+[`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
+
+## 0. Ask for the key, set up
+
+**First, ask the user for the path to their PostHog key file** — e.g. "What's
+the absolute path to your phx key file?" — plus the project id and region if you
+don't have them. Clone or copy the app you'll run against into a **throwaway
+`/tmp` directory** (never a real fixture) and set `WIZARD_PATH` to this repo.
+Never print or commit the key — pass it by file path, below.
+
+## 1. Drive it live over MCP (do this)
+
+Register the `wizard-ci-mcp` server. It holds **one live `WizardStore`** for the
+app and exposes it, so you drive every decision yourself:
+
+```bash
+claude mcp add wizard-ci \
+ -e APP_DIR=/tmp/<app-copy> \
+ -e POSTHOG_KEY_FILE=<absolute-path-to-phx-key-file> \
+ -e PROJECT_ID=<project-id> \
+ -e POSTHOG_REGION=us \
+ -- npx tsx "$WIZARD_PATH/scripts/wizard-ci-mcp.no-jest.ts"
+```
+
+`APP_DIR` is any directory — so for an **external repo**, clone it to `/tmp` and
+point `APP_DIR` at it (this is how you explore an arbitrary app, not just the
+ones in `wizard-workbench/apps/`).
+
+Then drive it turn by turn with the tools:
+
+- **`read_state`** — current screen, run phase, secret-free session,
+ tasks/status, pending question, and the actions legal right now. Call first
+ and after each move.
+- **`perform_action {action, params?}`** — commit a decision: `confirm_setup`,
+ `dismiss_outage`, `choose` (a setup question), `dismiss_outro`,
+ `set_mcp_outcome`, `dismiss_slack`, `keep_skills`. The action must appear in
+ `read_state.actions`.
+- **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
+- **`run_agent`** — on the `run` screen, run the **real integration agent**
+ (blocks for minutes); returns the final `runPhase` and next screen.
+
+A typical walk:
+
+```
+read_state → intro → perform_action confirm_setup
+read_state → health-check → perform_action dismiss_outage
+read_state → setup (if asked) → perform_action choose {key,value}
+read_state → run → run_agent (the real integration)
+read_state → outro → perform_action dismiss_outro
+read_state → mcp → perform_action set_mcp_outcome {outcome:"skipped"}
+read_state → slack-connect → perform_action dismiss_slack
+read_state → keep-skills → perform_action keep_skills {kept:false}
+```
+
+Call `render_screen` whenever you want to see what the user would see. The
+token is redacted in `read_state` and `render_screen`, so anything you capture
+is safe to share.
+
+## 2. Or run it hands-off (scripted)
+
+If you don't want to make the decisions, run the scripted profile end to end
+(for apps under `wizard-workbench/apps/`):
+
+```bash
+pnpm wizard-ci --e2e # real agent, headless; writes a recording
+pnpm wizard-ci-snapshots # renders each key moment → .ans + report.html
+```
+
+Replay it:
+`pnpm wizard-ci --replay /tmp/wizard-e2e-<name>.recording.json --step`.
+
+## Key facts
+
+- **State → screen.** You never navigate; you commit a decision (an action's
+ store setter) and the router re-derives the active screen. Name actions, not
+ keys.
+- **`run` is the only blocking step.** Everything else is an instant store
+ commit; `run_agent` is the real, billable integration.
+- **A green run ≠ a valid integration.** `runPhase=completed` means the flow
+ finished, not that the wizard understood the framework (e.g. it'll treat a
+ Wasp app as react-router). Read what it actually changed.
+- **None of this ships.** The harness lives in `e2e-harness/`, out of `src/`.
diff --git a/package.json b/package.json
index 11599eb8..6578d6c0 100644
--- a/package.json
+++ b/package.json
@@ -54,7 +54,7 @@
"xcode": "3.0.1",
"xml-js": "^1.6.11",
"yargs": "^16.2.0",
- "zod": "^3.24.2",
+ "zod": "^3.25.0",
"zod-to-json-schema": "^3.24.3"
},
"devDependencies": {
@@ -91,7 +91,8 @@
"ts-node": "^10.9.1",
"tsdown": "^0.21.9",
"tsx": "^4.20.3",
- "typescript": "^5.0.4"
+ "typescript": "^5.0.4",
+ "@modelcontextprotocol/sdk": "^1.29.0"
},
"engines": {
"node": "^20.20.0 || >=22.22.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 336708a7..e3c8aef5 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -10,13 +10,13 @@ importers:
dependencies:
'@anthropic-ai/claude-agent-sdk':
specifier: 0.3.169
- version: 0.3.169(@anthropic-ai/sdk@0.81.0(zod@3.24.2))(@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.24.2))(zod@3.24.2)
+ version: 0.3.169(@anthropic-ai/sdk@0.81.0(zod@3.25.76))(@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.25.76))(zod@3.25.76)
'@inkjs/ui':
specifier: ^2.0.0
version: 2.0.0(ink@6.8.0(@types/react@19.2.14)(react@19.2.4))
'@langchain/core':
specifier: ^0.3.40
- version: 0.3.40(openai@6.7.0(ws@8.18.1)(zod@3.24.2))
+ version: 0.3.40(openai@6.7.0(ws@8.18.1)(zod@3.25.76))
axios:
specifier: 1.7.4
version: 1.7.4
@@ -75,11 +75,11 @@ importers:
specifier: ^16.2.0
version: 16.2.0
zod:
- specifier: ^3.24.2
- version: 3.24.2
+ specifier: ^3.25.0
+ version: 3.25.76
zod-to-json-schema:
specifier: ^3.24.3
- version: 3.24.3(zod@3.24.2)
+ version: 3.24.3(zod@3.25.76)
devDependencies:
'@babel/core':
specifier: ^7.29.0
@@ -93,6 +93,9 @@ importers:
'@babel/types':
specifier: ~7.21.4
version: 7.21.5
+ '@modelcontextprotocol/sdk':
+ specifier: ^1.29.0
+ version: 1.29.0(@cfworker/json-schema@4.1.1)(zod@3.25.76)
'@types/chai':
specifier: ^4.3.17
version: 4.3.20
@@ -4099,8 +4102,8 @@ packages:
peerDependencies:
zod: ^3.25.28 || ^4
- zod@3.24.2:
- resolution: {integrity: sha512-lY7CDW43ECgW9u1TcT3IoXHflywfVqDYze4waEz812jR/bZ8FHDsl7pFQoSZTz5N+2NqRXs8GBwnAwo3ZNxqhQ==}
+ zod@3.25.76:
+ resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==}
snapshots:
@@ -4133,11 +4136,11 @@ snapshots:
'@anthropic-ai/claude-agent-sdk-win32-x64@0.3.169':
optional: true
- '@anthropic-ai/claude-agent-sdk@0.3.169(@anthropic-ai/sdk@0.81.0(zod@3.24.2))(@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.24.2))(zod@3.24.2)':
+ '@anthropic-ai/claude-agent-sdk@0.3.169(@anthropic-ai/sdk@0.81.0(zod@3.25.76))(@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.25.76))(zod@3.25.76)':
dependencies:
- '@anthropic-ai/sdk': 0.81.0(zod@3.24.2)
- '@modelcontextprotocol/sdk': 1.29.0(@cfworker/json-schema@4.1.1)(zod@3.24.2)
- zod: 3.24.2
+ '@anthropic-ai/sdk': 0.81.0(zod@3.25.76)
+ '@modelcontextprotocol/sdk': 1.29.0(@cfworker/json-schema@4.1.1)(zod@3.25.76)
+ zod: 3.25.76
optionalDependencies:
'@anthropic-ai/claude-agent-sdk-darwin-arm64': 0.3.169
'@anthropic-ai/claude-agent-sdk-darwin-x64': 0.3.169
@@ -4148,11 +4151,11 @@ snapshots:
'@anthropic-ai/claude-agent-sdk-win32-arm64': 0.3.169
'@anthropic-ai/claude-agent-sdk-win32-x64': 0.3.169
- '@anthropic-ai/sdk@0.81.0(zod@3.24.2)':
+ '@anthropic-ai/sdk@0.81.0(zod@3.25.76)':
dependencies:
json-schema-to-ts: 3.1.1
optionalDependencies:
- zod: 3.24.2
+ zod: 3.25.76
'@babel/code-frame@7.26.2':
dependencies:
@@ -5081,7 +5084,7 @@ snapshots:
'@eslint/eslintrc@2.1.4':
dependencies:
ajv: 6.12.6
- debug: 4.4.0
+ debug: 4.4.3
espree: 9.6.1
globals: 13.24.0
ignore: 5.3.2
@@ -5101,7 +5104,7 @@ snapshots:
'@humanwhocodes/config-array@0.13.0':
dependencies:
'@humanwhocodes/object-schema': 2.0.3
- debug: 4.4.0
+ debug: 4.4.3
minimatch: 3.1.2
transitivePeerDependencies:
- supports-color
@@ -5353,24 +5356,24 @@ snapshots:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.0
- '@langchain/core@0.3.40(openai@6.7.0(ws@8.18.1)(zod@3.24.2))':
+ '@langchain/core@0.3.40(openai@6.7.0(ws@8.18.1)(zod@3.25.76))':
dependencies:
'@cfworker/json-schema': 4.1.1
ansi-styles: 5.2.0
camelcase: 6.3.0
decamelize: 1.2.0
js-tiktoken: 1.0.19
- langsmith: 0.3.11(openai@6.7.0(ws@8.18.1)(zod@3.24.2))
+ langsmith: 0.3.11(openai@6.7.0(ws@8.18.1)(zod@3.25.76))
mustache: 4.2.0
p-queue: 6.6.2
p-retry: 4.6.2
uuid: 10.0.0
- zod: 3.24.2
- zod-to-json-schema: 3.24.3(zod@3.24.2)
+ zod: 3.25.76
+ zod-to-json-schema: 3.24.3(zod@3.25.76)
transitivePeerDependencies:
- openai
- '@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.24.2)':
+ '@modelcontextprotocol/sdk@1.29.0(@cfworker/json-schema@4.1.1)(zod@3.25.76)':
dependencies:
'@hono/node-server': 1.19.14(hono@4.12.18)
ajv: 8.20.0
@@ -5387,8 +5390,8 @@ snapshots:
json-schema-typed: 8.0.2
pkce-challenge: 5.0.1
raw-body: 3.0.2
- zod: 3.24.2
- zod-to-json-schema: 3.25.2(zod@3.24.2)
+ zod: 3.25.76
+ zod-to-json-schema: 3.25.2(zod@3.25.76)
optionalDependencies:
'@cfworker/json-schema': 4.1.1
transitivePeerDependencies:
@@ -5732,7 +5735,7 @@ snapshots:
dependencies:
'@typescript-eslint/typescript-estree': 5.62.0(typescript@5.7.3)
'@typescript-eslint/utils': 5.62.0(eslint@8.57.1)(typescript@5.7.3)
- debug: 4.4.0
+ debug: 4.4.3
eslint: 8.57.1
tsutils: 3.21.0(typescript@5.7.3)
optionalDependencies:
@@ -5746,7 +5749,7 @@ snapshots:
dependencies:
'@typescript-eslint/types': 5.62.0
'@typescript-eslint/visitor-keys': 5.62.0
- debug: 4.4.0
+ debug: 4.4.3
globby: 11.1.0
is-glob: 4.0.3
semver: 7.7.1
@@ -7286,7 +7289,7 @@ snapshots:
kleur@3.0.3: {}
- langsmith@0.3.11(openai@6.7.0(ws@8.18.1)(zod@3.24.2)):
+ langsmith@0.3.11(openai@6.7.0(ws@8.18.1)(zod@3.25.76)):
dependencies:
'@types/uuid': 10.0.0
chalk: 4.1.2
@@ -7296,7 +7299,7 @@ snapshots:
semver: 7.7.1
uuid: 10.0.0
optionalDependencies:
- openai: 6.7.0(ws@8.18.1)(zod@3.24.2)
+ openai: 6.7.0(ws@8.18.1)(zod@3.25.76)
leven@3.1.0: {}
@@ -7517,10 +7520,10 @@ snapshots:
dependencies:
mimic-function: 5.0.1
- openai@6.7.0(ws@8.18.1)(zod@3.24.2):
+ openai@6.7.0(ws@8.18.1)(zod@3.25.76):
optionalDependencies:
ws: 8.18.1
- zod: 3.24.2
+ zod: 3.25.76
optional: true
opn@5.5.0:
@@ -8342,12 +8345,12 @@ snapshots:
yoga-layout@3.2.1: {}
- zod-to-json-schema@3.24.3(zod@3.24.2):
+ zod-to-json-schema@3.24.3(zod@3.25.76):
dependencies:
- zod: 3.24.2
+ zod: 3.25.76
- zod-to-json-schema@3.25.2(zod@3.24.2):
+ zod-to-json-schema@3.25.2(zod@3.25.76):
dependencies:
- zod: 3.24.2
+ zod: 3.25.76
- zod@3.24.2: {}
+ zod@3.25.76: {}
diff --git a/scripts/README.md b/scripts/README.md
index a3b14dad..de1d0538 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -7,11 +7,12 @@ standalone `tsx` entry, named `*.no-jest.ts` so Jest ignores it.
Run from the repo root, e.g. `npx tsx scripts/.no-jest.ts`.
-| Script | What it does | Needs |
-| --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
-| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result + a recording. | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
-| **`render-snapshots.no-jest.ts`** | Renders a recording's key-moment frames to per-frame `.ans` snapshots (real Ink → ANSI). Feeds the workbench visual-regression flow. | a `recording.json` + outDir |
-| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance) or `--delay ` (auto-play). | a `recording.json` |
+| Script | What it does | Needs |
+| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
+| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result + a recording. | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
+| **`render-snapshots.no-jest.ts`** | Renders a recording's key-moment frames to per-frame `.ans` snapshots (real Ink → ANSI). Feeds the workbench visual-regression flow. | a `recording.json` + outDir |
+| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance) or `--delay ` (auto-play). | a `recording.json` |
+| **`wizard-ci-mcp.no-jest.ts`** | A stdio **MCP server** over one live store: an agent drives a real run turn-by-turn (`read_state` / `perform_action` / `run_agent` / `render_screen`). See the `exploring-the-wizard` skill. | `APP_DIR`, `POSTHOG_KEY_FILE` (or `POSTHOG_PERSONAL_API_KEY`), `PROJECT_ID` |
> You usually don't call these directly — `pnpm wizard-ci --e2e` and
> `pnpm wizard-ci-snapshots` (in
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
new file mode 100644
index 00000000..750d4273
--- /dev/null
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -0,0 +1,180 @@
+/**
+ * wizard-ci-mcp — a standalone stdio MCP server that holds ONE live WizardStore
+ * and exposes it, so an external agent (Claude Code) drives a real wizard run
+ * turn by turn: read_state → perform_action → … → run_agent → … → keep_skills,
+ * rendering the screen whenever it wants. Unlike e2e-full-run (which drives
+ * itself via the scripted profile), here the connected agent makes every choice.
+ *
+ * APP_DIR=/tmp/app POSTHOG_KEY_FILE=/path/to/phx.txt PROJECT_ID=… \
+ * npx tsx scripts/wizard-ci-mcp.no-jest.ts # speaks MCP on stdio
+ *
+ * Tools: read_state, list_actions, perform_action, render_screen, run_agent.
+ * stdout is the JSON-RPC channel — diagnostics go to stderr only.
+ */
+import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import { z } from 'zod';
+import fs from 'fs';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession, type WizardSession } from '@lib/wizard-session';
+import { Program } from '@lib/programs/program-registry';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { runAgent } from '@lib/agent/agent-runner';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
+import { renderFrame } from '@e2e-harness/replay';
+import type { RecordedFrame } from '@e2e-harness/recorder';
+
+const text = (data: unknown) => ({
+ content: [
+ {
+ type: 'text' as const,
+ text: typeof data === 'string' ? data : JSON.stringify(data, null, 2),
+ },
+ ],
+});
+
+async function main() {
+ // The key can come inline or from a file path (keeps the secret out of the
+ // MCP config the agent registers).
+ const apiKey = (
+ process.env.POSTHOG_PERSONAL_API_KEY ??
+ (process.env.POSTHOG_KEY_FILE
+ ? fs.readFileSync(process.env.POSTHOG_KEY_FILE, 'utf8')
+ : '')
+ ).trim();
+ const appDir = process.env.APP_DIR ?? '';
+ const projectId =
+ process.env.PROJECT_ID ?? process.env.POSTHOG_WIZARD_PROJECT_ID ?? '';
+ const region = process.env.POSTHOG_REGION ?? 'us';
+ if (!apiKey)
+ throw new Error('POSTHOG_PERSONAL_API_KEY or POSTHOG_KEY_FILE required');
+ if (!appDir || !fs.existsSync(appDir))
+ throw new Error(`APP_DIR missing or not found: ${appDir}`);
+
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store)); // real UI, never rendered → no stdout
+ store.session = buildSession({
+ installDir: appDir,
+ ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
+ apiKey,
+ projectId,
+ region,
+ });
+ await store.runReadyHooks(); // framework detection
+ store.runInitHooks(); // health-check readiness probe
+ const driver = new WizardCiDriver(store);
+
+ /** Render the current screen to ANSI (access token redacted). */
+ const renderNow = (): string => {
+ const s = store.session;
+ const session: WizardSession = s.credentials
+ ? {
+ ...s,
+ credentials: { ...s.credentials, accessToken: 'phx_***redacted***' },
+ }
+ : s;
+ const frame: RecordedFrame = {
+ seq: 0,
+ ms: 0,
+ triggers: ['screen'],
+ screen: store.currentScreen,
+ hasOverlay: store.router.hasOverlay,
+ session,
+ tasks: store.tasks.map((t) => ({
+ label: t.label,
+ status: t.status,
+ activeForm: t.activeForm,
+ done: t.done,
+ })),
+ statusMessages: [...store.statusMessages],
+ eventPlan: store.eventPlan.map((e) => ({
+ name: e.name,
+ description: e.description,
+ })),
+ };
+ return renderFrame(frame, Program.PostHogIntegration);
+ };
+
+ const server = new McpServer({ name: 'wizard-ci', version: '1.0.0' });
+
+ server.tool(
+ 'read_state',
+ "Read the wizard's committed state: current screen, run phase, a secret-free session view, agent tasks/status, any pending question, unresolved setup questions, and the actions legal right now. Call first and after every perform_action.",
+ {},
+ async () => text(driver.readState()),
+ );
+
+ server.tool(
+ 'list_actions',
+ 'List the commit actions legal on the current screen, with their params.',
+ {},
+ async () =>
+ text({
+ currentScreen: driver.readState().currentScreen,
+ actions: driver.listActions(),
+ }),
+ );
+
+ server.tool(
+ 'perform_action',
+ 'Commit a decision: invoke a legal action for the current screen (e.g. confirm_setup, dismiss_outage, choose, set_mcp_outcome, dismiss_slack, keep_skills). Returns the next state. The action must appear in read_state.actions.',
+ {
+ action: z.string().describe('Action id from read_state.actions'),
+ params: z
+ .record(z.string(), z.unknown())
+ .optional()
+ .describe('Action params, e.g. { key: "router", value: "app" }'),
+ },
+ async ({ action, params }) => {
+ try {
+ return text(driver.performAction(action, params ?? {}));
+ } catch (e) {
+ return {
+ content: [
+ {
+ type: 'text' as const,
+ text: `Error: ${e instanceof Error ? e.message : String(e)}`,
+ },
+ ],
+ isError: true,
+ };
+ }
+ },
+ );
+
+ server.tool(
+ 'render_screen',
+ 'Render the current TUI screen to ANSI so you can see exactly what the user would.',
+ {},
+ async () => text(renderNow()),
+ );
+
+ server.tool(
+ 'run_agent',
+ "Run the real wizard integration agent — the `run` screen's work. Blocks until it finishes (minutes), then returns the final runPhase and next screen. Call when read_state shows currentScreen=run.",
+ {},
+ async () => {
+ await store.getGate('intro');
+ await store.getGate('health-check');
+ await runAgent(posthogIntegrationConfig, store.session);
+ return text({
+ runPhase: store.session.runPhase,
+ currentScreen: store.currentScreen,
+ });
+ },
+ );
+
+ process.stderr.write(
+ `wizard-ci-mcp: serving on stdio (app=${appDir}, detected=${
+ store.session.integration ?? '?'
+ })\n`,
+ );
+ await server.connect(new StdioServerTransport());
+}
+
+main().catch((e) => {
+ process.stderr.write(`wizard-ci-mcp fatal: ${e?.stack ?? e}\n`);
+ process.exit(1);
+});
From 59ffdc1312b902e76810214e6daf3ca51c9fcac5 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:25:42 -0400
Subject: [PATCH 21/38] chore: align zod spec to ^3.25.76 (matches the pi stack
#701)
Same resolved version; just the package.json floor, so #701 and #702 don't
conflict on the zod line.
Co-Authored-By: Claude Opus 4.8
---
package.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package.json b/package.json
index 6578d6c0..75a381f7 100644
--- a/package.json
+++ b/package.json
@@ -54,7 +54,7 @@
"xcode": "3.0.1",
"xml-js": "^1.6.11",
"yargs": "^16.2.0",
- "zod": "^3.25.0",
+ "zod": "^3.25.76",
"zod-to-json-schema": "^3.24.3"
},
"devDependencies": {
From 217e7177b338122698016724b8cdedccd1f75f56 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:28:50 -0400
Subject: [PATCH 22/38] refactor(e2e-harness): drop redundant list_actions from
the MCP server
read_state already returns the legal actions, so the separate tool is noise.
Keeps the server's surface minimal: read_state, perform_action, render_screen,
run_agent.
Co-Authored-By: Claude Opus 4.8
---
scripts/wizard-ci-mcp.no-jest.ts | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 750d4273..daef748d 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -8,7 +8,7 @@
* APP_DIR=/tmp/app POSTHOG_KEY_FILE=/path/to/phx.txt PROJECT_ID=… \
* npx tsx scripts/wizard-ci-mcp.no-jest.ts # speaks MCP on stdio
*
- * Tools: read_state, list_actions, perform_action, render_screen, run_agent.
+ * Tools: read_state, perform_action, render_screen, run_agent.
* stdout is the JSON-RPC channel — diagnostics go to stderr only.
*/
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
@@ -106,17 +106,6 @@ async function main() {
async () => text(driver.readState()),
);
- server.tool(
- 'list_actions',
- 'List the commit actions legal on the current screen, with their params.',
- {},
- async () =>
- text({
- currentScreen: driver.readState().currentScreen,
- actions: driver.listActions(),
- }),
- );
-
server.tool(
'perform_action',
'Commit a decision: invoke a legal action for the current screen (e.g. confirm_setup, dismiss_outage, choose, set_mcp_outcome, dismiss_slack, keep_skills). Returns the next state. The action must appear in read_state.actions.',
From 9b870f36c5549b1fba22e9148e3571bc0d5c78d7 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:39:22 -0400
Subject: [PATCH 23/38] docs: revert prettier reflow of README + AGENTS, keep
only the real change
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Running prettier on these (not in lint-staged) reflowed the whole files — pure
diff noise. Restore them to main and re-apply just the intended edits: the
"Explore with an agent" section + the exploring-the-wizard skill row.
---
AGENTS.md | 3 +-
README.md | 283 +++++++++++++++++++++++-------------------------------
2 files changed, 124 insertions(+), 162 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index 68189191..dc058a83 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -31,7 +31,7 @@ boundaries, screen resolution
## Skills available
-Four skills live under `.claude/skills/`. Read `wizard-development` first for any structural change; then load the relevant procedural skill:
+Five skills live under `.claude/skills/`. Read `wizard-development` first for any structural change; then load the relevant procedural skill:
| Skill | When to use |
|---|---|
@@ -39,6 +39,7 @@ Four skills live under `.claude/skills/`. Read `wizard-development` first for an
| `adding-framework-support` | Adding a new framework integration (e.g. Ruby on Rails, Go, Angular). |
| `adding-skill-program` | Adding a new skill-based program (e.g. a new product feature setup). |
| `ink-tui` | Building or modifying TUI screens, layouts, and primitives. |
+| `exploring-the-wizard` | Running/driving/exploring the wizard headlessly (read_state/perform_action, TUI snapshots). |
## CLI command surface
diff --git a/README.md b/README.md
index cbb99f3e..1942c84f 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
-> have any feedback, please drop an email to
-> **[wizard@posthog.com](mailto:wizard@posthog.com)**.
+
+> have any feedback, please drop an email to **[wizard@posthog.com](mailto:wizard@posthog.com)**.
PostHog wizard ✨
@@ -19,36 +19,22 @@ To use the wizard, you can run it directly using:
npx @posthog/wizard
```
-Currently the wizard can be used for over 16+ frameworks for frontend, backend,
-and mobile applications. If you have other integrations you would like the
-wizard to support, please open a
-[GitHub issue](https://github.com/posthog/wizard/issues)!
+Currently the wizard can be used for over 16+ frameworks for frontend, backend, and mobile applications. If you have other integrations you would like the wizard to
+support, please open a [GitHub issue](https://github.com/posthog/wizard/issues)!
-Visit our [docs](https://posthog.com/docs/ai-engineering/ai-wizard) to learn
-more.
+Visit our [docs](https://posthog.com/docs/ai-engineering/ai-wizard) to learn more.
## Privacy & data usage
-The wizard uses **Anthropic Claude** (via PostHog's LLM gateway) to read your
-project's source files and integrate PostHog. A few things worth knowing up
-front:
+The wizard uses **Anthropic Claude** (via PostHog's LLM gateway) to read your project's source files and integrate PostHog. A few things worth knowing up front:
- **Source files** are sent to Anthropic as part of the agent's context.
-- **`.env*` files and secrets** stay on your machine. The wizard's security
- scanner blocks anything it identifies as a secret from being read by the
- agent.
-- **Telemetry** (run metadata — phase, task list, planned events) is sent to
- PostHog by default. Pass `--no-telemetry` (or set
- `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
-- **AI opt-in**: the wizard honors your PostHog organization's
- `is_ai_data_processing_approved` setting (the same toggle that gates Max). If
- your org has not opted in, the wizard explains how to enable it and exits
- without sending source to Anthropic.
-- **Prefer your own AI?** The wizard's integration knowledge ships as a
- context-mill skill you can download and run inside your own agent.
-
-The wizard's "Privacy & data usage" menu (intro screen) and the `[I]` shortcut
-on the auth screen surface the same information in-terminal.
+- **`.env*` files and secrets** stay on your machine. The wizard's security scanner blocks anything it identifies as a secret from being read by the agent.
+- **Telemetry** (run metadata — phase, task list, planned events) is sent to PostHog by default. Pass `--no-telemetry` (or set `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
+- **AI opt-in**: the wizard honors your PostHog organization's `is_ai_data_processing_approved` setting (the same toggle that gates Max). If your org has not opted in, the wizard explains how to enable it and exits without sending source to Anthropic.
+- **Prefer your own AI?** The wizard's integration knowledge ships as a context-mill skill you can download and run inside your own agent.
+
+The wizard's "Privacy & data usage" menu (intro screen) and the `[I]` shortcut on the auth screen surface the same information in-terminal.
## MCP Commands
@@ -66,8 +52,8 @@ npx @posthog/wizard mcp remove
## Audit
Audit an existing PostHog integration for correctness and best practices. The
-`audit` command is a **family**. With no subcommand it runs the **events** audit
-(the default); pass a subcommand to run a specific one:
+`audit` command is a **family**. With no subcommand it runs the **events**
+audit (the default); pass a subcommand to run a specific one:
```bash
# Runs the events audit (the default) — no subcommand needed
@@ -86,12 +72,12 @@ npx @posthog/wizard audit web-analytics # web analytics setup
Most audit subcommands resolve at runtime from the published skill registry, so
new audits appear without a wizard release (`web-analytics` is wizard-native).
-> **`audit ` chooses an audit area — it does not take a skill
-> name.** The audit subcommands above _are_ context-mill skills promoted to
-> commands (via a `cli: role: command` block);
-> [`wizard skill `](#run-a-single-skill) runs a skill that hasn't
-> been promoted. Same machinery, two surfaces. (`wizard audit --help` still
-> labels the positional `[skill]` — read it as "pick a subcommand.")
+> **`audit ` chooses an audit area — it does not take a skill name.**
+> The audit subcommands above *are* context-mill skills promoted to commands (via
+> a `cli: role: command` block); [`wizard skill `](#run-a-single-skill)
+> runs a skill that hasn't been promoted. Same machinery, two surfaces.
+> (`wizard audit --help` still labels the positional `[skill]` — read it as "pick
+> a subcommand.")
## Run a single skill
@@ -129,13 +115,13 @@ OAuth sources open the PostHog app's new-source flow in your browser.
## Headless signup + install (agents / CI)
-> ⚠️ `--ci` is **not currently supported in published builds** (see
-> [CI Mode](#ci-mode)). This flow works in development builds only.
+> ⚠️ `--ci` is **not currently supported in published builds** (see [CI Mode](#ci-mode)).
+> This flow works in development builds only.
-For a fully non-interactive first-run (no existing PostHog account, no TTY, no
-browser), combine `--ci --signup --email`. The wizard provisions a new account,
-uses the returned personal API key to run the normal CI install, and wires
-PostHog into the project at `--install-dir`:
+For a fully non-interactive first-run (no existing PostHog account, no TTY,
+no browser), combine `--ci --signup --email`. The wizard provisions a new
+account, uses the returned personal API key to run the normal CI install,
+and wires PostHog into the project at `--install-dir`:
```bash
npx @posthog/wizard --ci --signup \
@@ -161,37 +147,38 @@ npx @posthog/wizard provision --email user@example.com --region eu --json
```
Success prints the full `ProvisioningResult` (`projectApiKey`, `host`,
-`projectId`, `accountId`, `accessToken`, `refreshToken`, and `personalApiKey` if
-present). Failure exits 1; in `--json` mode the error is emitted to stderr as
-`{"error":"...","code":"..."}`, with `code` set to `email_exists` when the
-address is already registered.
+`projectId`, `accountId`, `accessToken`, `refreshToken`, and
+`personalApiKey` if present). Failure exits 1; in `--json` mode the error
+is emitted to stderr as `{"error":"...","code":"..."}`, with `code` set to
+`email_exists` when the address is already registered.
-> ⚠️ **Output contains live credentials.** Pipe it into a secrets store — do not
-> let it be captured by shared CI logs. Mask the step output or redirect stdout
-> to a file your job reads and discards.
+> ⚠️ **Output contains live credentials.** Pipe it into a secrets store —
+> do not let it be captured by shared CI logs. Mask the step output or
+> redirect stdout to a file your job reads and discards.
# Options
The following CLI arguments are available:
-| Option | Description | Type | Default | Choices | Environment Variable |
-| ---------------- | ----------------------------------------------------- | ------- | ------- | ------- | ----------------------------- |
-| `--help` | Show help | boolean | | | |
-| `--version` | Show version number | boolean | | | |
-| `--debug` | Enable verbose logging | boolean | `false` | | `POSTHOG_WIZARD_DEBUG` |
-| `--signup` | Create a new PostHog account during setup | boolean | `false` | | `POSTHOG_WIZARD_SIGNUP` |
-| `--install-dir` | Directory to install PostHog in | string | | | `POSTHOG_WIZARD_INSTALL_DIR` |
-| `--ci` | Enable CI mode for non-interactive execution | boolean | `false` | | `POSTHOG_WIZARD_CI` |
-| `--api-key` | PostHog personal API key (phx_xxx) for authentication | string | | | `POSTHOG_WIZARD_API_KEY` |
-| `--no-telemetry` | Disable wizard run-state telemetry | boolean | `false` | | `POSTHOG_WIZARD_NO_TELEMETRY` |
+| Option | Description | Type | Default | Choices | Environment Variable |
+| ----------------- | ---------------------------------------------------------------- | ------- | ------- | ---------------------------------------------------- | ------------------------------ |
+| `--help` | Show help | boolean | | | |
+| `--version` | Show version number | boolean | | | |
+| `--debug` | Enable verbose logging | boolean | `false` | | `POSTHOG_WIZARD_DEBUG` |
+| `--signup` | Create a new PostHog account during setup | boolean | `false` | | `POSTHOG_WIZARD_SIGNUP` |
+| `--install-dir` | Directory to install PostHog in | string | | | `POSTHOG_WIZARD_INSTALL_DIR` |
+| `--ci` | Enable CI mode for non-interactive execution | boolean | `false` | | `POSTHOG_WIZARD_CI` |
+| `--api-key` | PostHog personal API key (phx_xxx) for authentication | string | | | `POSTHOG_WIZARD_API_KEY` |
+| `--no-telemetry` | Disable wizard run-state telemetry | boolean | `false` | | `POSTHOG_WIZARD_NO_TELEMETRY` |
+
# CI Mode
> ⚠️ **CI mode is not currently supported in published builds.** PostHog's LLM
-> gateway doesn't yet grant the scopes the wizard needs to personal API keys for
-> most users, so non-interactive `--ci` runs fail at the gateway. The flag is
-> disabled in the published package and exits with an error — run the wizard in
-> an interactive terminal instead (`npx @posthog/wizard`). The notes below
+> gateway doesn't yet grant the scopes the wizard needs to personal API keys
+> for most users, so non-interactive `--ci` runs fail at the gateway. The flag
+> is disabled in the published package and exits with an error — run the wizard
+> in an interactive terminal instead (`npx @posthog/wizard`). The notes below
> describe CI mode as it works in development builds.
Run the wizard non-interactive executions with `--ci`:
@@ -212,10 +199,8 @@ The CLI args override environment variables in CI mode.
### Required Flags for CI Mode
-- `--api-key`: Personal API key (`phx_xxx`) from your
- [PostHog settings](https://app.posthog.com/settings/user-api-keys)
-- `--install-dir`: Directory to install PostHog in (e.g., `.` for current
- directory)
+- `--api-key`: Personal API key (`phx_xxx`) from your [PostHog settings](https://app.posthog.com/settings/user-api-keys)
+- `--install-dir`: Directory to install PostHog in (e.g., `.` for current directory)
### Required API Key Scopes
@@ -229,8 +214,8 @@ When creating your personal API key, ensure it has the following scopes enabled:
### OAuth app scope ceiling
-The wizard's OAuth app on the PostHog side caps the scopes its tokens may carry
-(`OAuthApplication.scopes`). Any scope requested in this repo (see
+The wizard's OAuth app on the PostHog side caps the scopes its tokens may
+carry (`OAuthApplication.scopes`). Any scope requested in this repo (see
`src/lib/oauth/program-scopes.ts`) must be present in that list. Current
ceiling, for bookkeeping:
@@ -243,18 +228,18 @@ user:read,project:read,llm_gateway:read,dashboard:read,dashboard:write,insight:r
The CLI was overhauled to consolidate commands into a smaller, extensible
surface. If you used an older command, here's where it went:
-| Old command | New command | What changed |
-| ----------------------------- | --------------------------- | ------------------------------------------------------ |
-| `wizard integrate` | `wizard` (default flow) | Command removed; the default flow runs the integration |
-| `wizard events-audit` | `wizard audit events` | Now an `audit`-family subcommand |
-| `wizard audit` (single audit) | `wizard audit ` | Now a family; see [Audit](#audit) for the subcommands |
-| `wizard audit-3000` | _removed_ | Retired |
-| `wizard revenue` | `wizard revenue-analytics` | Renamed (old `revenue` removed) |
-| `wizard upload-sourcemaps` | `wizard upload-source-maps` | Renamed; `upload-sourcemaps` still works as an alias |
-
-> **Commands vs. programs:** `integrate` was the _command_; the program behind
-> it is `posthog-integration`, which still exists and now powers the default
-> flow. Other commands depend on it via `requires: ['posthog-integration']`. The
+| Old command | New command | What changed |
+|---|---|---|
+| `wizard integrate` | `wizard` (default flow) | Command removed; the default flow runs the integration |
+| `wizard events-audit` | `wizard audit events` | Now an `audit`-family subcommand |
+| `wizard audit` (single audit) | `wizard audit ` | Now a family; see [Audit](#audit) for the subcommands |
+| `wizard audit-3000` | *removed* | Retired |
+| `wizard revenue` | `wizard revenue-analytics` | Renamed (old `revenue` removed) |
+| `wizard upload-sourcemaps` | `wizard upload-source-maps` | Renamed; `upload-sourcemaps` still works as an alias |
+
+> **Commands vs. programs:** `integrate` was the *command*; the program behind it
+> is `posthog-integration`, which still exists and now powers the default flow.
+> Other commands depend on it via `requires: ['posthog-integration']`. The
> program id is internal — it was never a command you typed.
# Steal this code
@@ -290,8 +275,8 @@ When the user authenticates, the wizard also streams live run state — current
phase, task list, planned events — to `POST /api/projects/{id}/wizard/sessions/`
so the PostHog web app can render real-time progress. Updates are debounced
(250ms) with phase changes flushed immediately; failures fall back silently to
-the wizard's debug log without disturbing the TUI. Pass `--no-telemetry` (or set
-`POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
+the wizard's debug log without disturbing the TUI. Pass `--no-telemetry` (or
+set `POSTHOG_WIZARD_NO_TELEMETRY=1`) to disable.
## Leave rules behind
@@ -326,9 +311,10 @@ users of the wizard, no training delays or other ambiguity.
## Keep secrets out of the LLM
-The wizard somtimes needs to move a secret. The agent orchestrates that journey,
-but the raw value should _never_ enter the LLM conversation, where it would be
-sent to the model provider, written to transcripts, and captured in logs.
+The wizard sometimes needs to move a secret. The agent
+orchestrates that journey, but the raw value should _never_ enter the LLM
+conversation, where it would be sent to the model provider, written to
+transcripts, and captured in logs.
`src/lib/secret-vault.ts` is a small, reusable pattern for exactly this. It's a
session-scoped, in-memory vault: a tool that handles a secret calls `put()` to
@@ -351,50 +337,42 @@ drive the work end to end, but the only thing it ever sees is an opaque handle.
## Build system
-Built with [tsdown](https://tsdown.dev/) (Rolldown). `pnpm build` bundles
-`bin.ts` into ESM chunks in `dist/`, inlining all local source and keeping npm
-dependencies external.
+Built with [tsdown](https://tsdown.dev/) (Rolldown). `pnpm build` bundles `bin.ts` into ESM chunks in `dist/`, inlining all local source and keeping npm dependencies external.
### Environment variables
-**Build-time (locked).** `NODE_ENV` is replaced with `"production"` at compile
-time. It cannot be overridden at runtime. All URLs, OAuth client IDs, and
-dev-mode code paths resolve to their production values unconditionally.
+**Build-time (locked).** `NODE_ENV` is replaced with `"production"` at compile time. It cannot be overridden at runtime. All URLs, OAuth client IDs, and dev-mode code paths resolve to their production values unconditionally.
-To add a new build-time constant, add it to `env` in `tsdown.config.ts` and
-export it from `src/env.ts`.
+To add a new build-time constant, add it to `env` in `tsdown.config.ts` and export it from `src/env.ts`.
-**Runtime (allowlisted).** Runtime env reads go through `runtimeEnv()` in
-`src/env.ts`, which only accepts keys in the `RuntimeEnvKey` union:
+**Runtime (allowlisted).** Runtime env reads go through `runtimeEnv()` in `src/env.ts`, which only accepts keys in the `RuntimeEnvKey` union:
-| Variable | Purpose |
-| ---------------------------------- | --------------------------------- |
-| `POSTHOG_WIZARD_BENCHMARK_CONFIG` | Path to benchmark config file |
-| `POSTHOG_WIZARD_BENCHMARK_FILE` | Output path for benchmark results |
-| `POSTHOG_WIZARD_LOG_DIR` | Log directory override |
-| `POSTHOG_WIZARD_DEBUG` / `DEBUG` | Enable debug output |
-| `MCP_URL` | Override MCP server URL |
-| `POSTHOG_API_KEY` | API key for MCP subprocess auth |
-| `TERM`, `TERM_PROGRAM`, `CI`, etc. | Terminal/platform detection |
-| `APPDATA`, `XDG_CONFIG_HOME` | Platform path resolution |
+| Variable | Purpose |
+|---|---|
+| `POSTHOG_WIZARD_BENCHMARK_CONFIG` | Path to benchmark config file |
+| `POSTHOG_WIZARD_BENCHMARK_FILE` | Output path for benchmark results |
+| `POSTHOG_WIZARD_LOG_DIR` | Log directory override |
+| `POSTHOG_WIZARD_DEBUG` / `DEBUG` | Enable debug output |
+| `MCP_URL` | Override MCP server URL |
+| `POSTHOG_API_KEY` | API key for MCP subprocess auth |
+| `TERM`, `TERM_PROGRAM`, `CI`, etc. | Terminal/platform detection |
+| `APPDATA`, `XDG_CONFIG_HOME` | Platform path resolution |
To add a new runtime env var, add its key to `RuntimeEnvKey` in `src/env.ts`.
-**Direct `process.env` access** is only used for subprocess environment writes
-(e.g. `agent-interface.ts` setting `ANTHROPIC_BASE_URL`), vendored code, and
-tests.
+**Direct `process.env` access** is only used for subprocess environment writes (e.g. `agent-interface.ts` setting `ANTHROPIC_BASE_URL`), vendored code, and tests.
### Import aliases
Path aliases defined in `tsconfig.build.json`, resolved by tsdown:
-| Alias | Maps to |
-| --------------- | ------------------ |
-| `@env` | `src/env.ts` |
-| `@lib/*` | `src/lib/*` |
-| `@utils/*` | `src/utils/*` |
-| `@ui/*` | `src/ui/*` |
-| `@steps/*` | `src/steps/*` |
+| Alias | Maps to |
+|---|---|
+| `@env` | `src/env.ts` |
+| `@lib/*` | `src/lib/*` |
+| `@utils/*` | `src/utils/*` |
+| `@ui/*` | `src/ui/*` |
+| `@steps/*` | `src/steps/*` |
| `@frameworks/*` | `src/frameworks/*` |
## Running locally
@@ -411,8 +389,7 @@ pnpm try --install-dir=[a path]
pnpm run dev
```
-This builds, links globally, and watches for changes. Leave it running - any
-`.ts` file changes will auto-rebuild. Then from any project:
+This builds, links globally, and watches for changes. Leave it running - any `.ts` file changes will auto-rebuild. Then from any project:
```bash
wizard --integration=nextjs
@@ -440,28 +417,18 @@ LLM calls. See the `e2e-tests/README.md` for more information.
#### Explore with an agent
-You can hand the wizard to an AI agent and have it **run, drive, and explore the
-wizard itself** — against any app, headlessly, snapshotting the TUI so it can
-see what happened. The runbook is
-[`e2e-harness/EXPLORING-AS-AN-AGENT.md`](e2e-harness/EXPLORING-AS-AN-AGENT.md):
-it covers driving the flow through the `wizard-ci-tools` control plane
-(`read_state` / `list_actions` / `perform_action`), capturing snapshots with
-`renderFrame`, and the env a run needs.
-
-Point an agent at it with a prompt like — here, exploring against
+You can hand the wizard to an AI agent and have it run, drive, and explore the
+wizard itself — headlessly, snapshotting the TUI so it can see what happened. The
+how-to is the `exploring-the-wizard` skill
+(`.claude/skills/exploring-the-wizard/SKILL.md`), which an agent in this repo
+discovers automatically. Example prompt — explore against
[open-saas](https://github.com/wasp-lang/open-saas):
-> Explore the PostHog wizard against a real app. Read
-> `e2e-harness/EXPLORING-AS-AN-AGENT.md` — your runbook for driving the wizard
-> headlessly, capturing snapshots, and the env you'll need. Ask me for my phx
-> key file path and set up per the runbook. Then clone
-> `https://github.com/wasp-lang/open-saas` into a throwaway `/tmp` copy, work
-> out how to build it, and run the wizard against it — driving the flow,
-> snapshotting each key moment, and rendering the screens back so I can see
-> them. Then tell me what the wizard did: which screens it walked, what it
-> changed, and anything that broke.
-
-The agent works out how to build and run the target itself — that's the point.
+> Explore the PostHog wizard against a real app. Use the `exploring-the-wizard`
+> skill — your runbook for driving the wizard headlessly and snapshotting it. Ask
+> me for my phx key file path, then clone `https://github.com/wasp-lang/open-saas`
+> into a throwaway `/tmp` copy, run the wizard against it driving the flow and
+> snapshotting each key moment, and tell me what it did and anything that broke.
## Publishing your tool
@@ -474,26 +441,26 @@ To make your version of a tool usable with a one-line `npx` command:
# Health checks
-`src/lib/health-checks/` checks external status pages and PostHog-owned services
-before the wizard runs to decide whether it can proceed. The entry point is
-`evaluateWizardReadiness()`, which returns one of three values:
+`src/lib/health-checks/` checks external status pages and PostHog-owned
+services before the wizard runs to decide whether it can proceed. The entry
+point is `evaluateWizardReadiness()`, which returns one of three values:
-| Decision | Meaning |
-| ------------------- | ---------------------------------------------------------- |
-| `yes` | All services healthy — proceed normally. |
-| `yes_with_warnings` | Some services degraded but no critical dependency is down. |
-| `no` | A critical dependency is down or degraded — do not run. |
+| Decision | Meaning |
+| ------------------- | --------------------------------------------------------------- |
+| `yes` | All services healthy — proceed normally. |
+| `yes_with_warnings` | Some services degraded but no critical dependency is down. |
+| `no` | A critical dependency is down or degraded — do not run. |
### Module layout
-| File | Responsibility |
-| --------------- | ------------------------------------------------------------------------------------- |
-| `types.ts` | Enums, interfaces (`ServiceHealthStatus`, `AllServicesHealth`, etc.) |
+| File | Responsibility |
+| --- | --- |
+| `types.ts` | Enums, interfaces (`ServiceHealthStatus`, `AllServicesHealth`, etc.) |
| `statuspage.ts` | Statuspage.io v2 API helpers + checks for Anthropic, PostHog, GitHub, npm, Cloudflare |
-| `endpoints.ts` | Direct endpoint checks for LLM Gateway (`/_liveness`) and MCP (`/`) |
-| `readiness.ts` | `checkAllExternalServices`, `evaluateWizardReadiness`, readiness config |
-| `index.ts` | Barrel re-export |
-| `testme.md` | Test running instructions and endpoint reference |
+| `endpoints.ts` | Direct endpoint checks for LLM Gateway (`/_liveness`) and MCP (`/`) |
+| `readiness.ts` | `checkAllExternalServices`, `evaluateWizardReadiness`, readiness config |
+| `index.ts` | Barrel re-export |
+| `testme.md` | Test running instructions and endpoint reference |
## What blocks a run
@@ -514,19 +481,14 @@ degradedBlocksRun: ['anthropic'],
## Smoke test helper (`scripts/smoke-test-ci.sh`)
-This repo includes a helper script to run a full end‑to‑end smoke test of the
-wizard packaged in a tarball against a real app from
-[`posthog/wizard-workbench`](https://github.com/PostHog/wizard-workbench). This
-will catch certain packaging issues that might not be caught by other tests.
+This repo includes a helper script to run a full end‑to‑end smoke test of the wizard packaged in a tarball against a real app from [`posthog/wizard-workbench`](https://github.com/PostHog/wizard-workbench). This will catch certain packaging issues that might not be caught by other tests.
**Prerequisites**
- Point to a `wizard-workbench` checkout either by:
- Setting `WIZARD_WORKBENCH_ROOT=/absolute/path/to/wizard-workbench`, or
- - Cloning `wizard-workbench` next to this repo (so it lives at
- `../wizard-workbench`).
-- Set `POSTHOG_PERSONAL_API_KEY` either in your shell or in
- `../wizard-workbench/.env`.
+ - Cloning `wizard-workbench` next to this repo (so it lives at `../wizard-workbench`).
+- Set `POSTHOG_PERSONAL_API_KEY` either in your shell or in `../wizard-workbench/.env`.
- (Optional) Set `POSTHOG_PROJECT_ID` to target a specific PostHog project.
**Usage**
@@ -554,5 +516,4 @@ The script will:
- Copy the selected app into a temp directory
- Install dependencies for the app
- Install the packed wizard tarball into an isolated temp project
-- Run `wizard` in `--ci` mode against the copied app and perform basic
- post‑install checks
+- Run `wizard` in `--ci` mode against the copied app and perform basic post‑install checks
From 119dba0c042fe1036e4ac1575922a8b068b8c5b7 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:45:39 -0400
Subject: [PATCH 24/38] =?UTF-8?q?docs:=20fix=20dead=20link=20=E2=80=94=20p?=
=?UTF-8?q?oint=20ARCHITECTURE=20at=20the=20skill,=20not=20the=20deleted?=
=?UTF-8?q?=20runbook?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
EXPLORING-AS-AN-AGENT.md was promoted to .claude/skills/exploring-the-wizard/;
this pointer fix was left uncommitted, so HEAD still linked the deleted file.
---
e2e-harness/ARCHITECTURE.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index 16f0b984..e13ec20e 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -7,9 +7,9 @@ on structured state — not the classic `--ci` mode (LoggingUI, stdout-grep,
agent-only).
> If you're an agent that just wants to **run and explore the wizard** (drive
-> it, view the screens, snapshot it), start with the runbook:
-> [`EXPLORING-AS-AN-AGENT.md`](EXPLORING-AS-AN-AGENT.md). This doc is the _how
-> it works_ underneath.
+> it, view the screens, snapshot it), use the `exploring-the-wizard` skill
+> ([`.claude/skills/exploring-the-wizard/SKILL.md`](../.claude/skills/exploring-the-wizard/SKILL.md)).
+> This doc is the _how it works_ underneath.
## The pieces
From e7129c470a70850588a34fed06162f088d6d0e98 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 18:50:35 -0400
Subject: [PATCH 25/38] =?UTF-8?q?fix(skill):=20correct=20the=20driving=20i?=
=?UTF-8?q?nstructions=20=E2=80=94=20MCP=20tools=20bind=20at=20session=20s?=
=?UTF-8?q?tart?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The skill told agents to run `claude mcp add` and then immediately call the
tools, which is impossible (MCP servers load at session start), so agents fell
back to a script. Lead with the in-session approach that actually works — a
tested WizardCiDriver script (read_state → perform_action → renderFrame) — and
document the MCP server as the interactive option, which must be registered
before starting a fresh session.
---
.claude/skills/exploring-the-wizard/SKILL.md | 167 +++++++++++--------
1 file changed, 101 insertions(+), 66 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index cc109520..a6d0c3dc 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -1,101 +1,136 @@
---
name: exploring-the-wizard
-description:
- Run, drive, and explore the PostHog wizard headlessly against an app —
- manipulate its state turn-by-turn over MCP (read_state / perform_action /
- run_agent), capturing TUI snapshots to view. Use when you want to test or
- explore the wizard end-to-end without a terminal.
+description: Run, drive, and explore the PostHog wizard headlessly against an app — decide each screen yourself (read_state / perform_action / run_agent) and snapshot the TUI to view. Use when you want to test or explore the wizard end-to-end without a terminal.
compatibility: Designed for Claude Code working on the PostHog wizard codebase.
metadata:
author: posthog
- version: '2.0'
+ version: "2.1"
---
# Exploring the wizard as an agent
-Drive a real wizard run headlessly and **manipulate its state as it happens** —
-read the current screen, make the user's decision, fire the agent, snapshot the
-TUI — all over MCP. The control plane lives in `e2e-harness/`; for _how_ it
-works underneath, read
+Drive a real wizard run headlessly and decide each step yourself — read the
+current screen, commit a decision, fire the agent, snapshot the TUI. The control
+plane is `WizardCiDriver` (read/act over a live store); for _how_ it works, read
[`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
## 0. Ask for the key, set up
-**First, ask the user for the path to their PostHog key file** — e.g. "What's
-the absolute path to your phx key file?" — plus the project id and region if you
-don't have them. Clone/point at the app you'll run against as a **throwaway
-`/tmp` copy** (never a real fixture). Note `WIZARD_PATH` (this repo). Never
-print or commit the key — pass it by file path, below.
+**First, ask the user for the path to their PostHog key file** — e.g. "What's the
+absolute path to your phx key file?" — plus the project id and region if you don't
+have them. Clone/point at the app as a **throwaway `/tmp` copy** (never a real
+fixture). Note `WIZARD_PATH` (this repo). Never print or commit the key.
+
+## 1. Drive it from a script (works in THIS session)
+
+Write a script that drives the `WizardCiDriver` turn by turn — `readState()` →
+_your_ decision → `performAction()`, `renderFrame()` to see each screen. Put it
+**inside this repo** (so `@lib`/`@e2e-harness` resolve), name it
+`scripts/explore.no-jest.ts`, run `npx tsx scripts/explore.no-jest.ts`, then
+delete it. Run it, read the output, adjust the decisions, re-run.
+
+```ts
+import fs from 'fs';
+import { WizardStore } from '@ui/tui/store';
+import { InkUI } from '@ui/tui/ink-ui';
+import { setUI } from '@ui/index';
+import { buildSession } from '@lib/wizard-session';
+import { Program } from '@lib/programs/program-registry';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { runAgent } from '@lib/agent/agent-runner';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
+import { WizardRecorder } from '@e2e-harness/recorder';
+import { renderFrame } from '@e2e-harness/replay';
+
+async function main() {
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store));
+ store.session = buildSession({
+ installDir: process.env.APP_DIR!,
+ ci: true,
+ apiKey: fs.readFileSync(process.env.POSTHOG_KEY_FILE!, 'utf8').trim(),
+ projectId: process.env.PROJECT_ID!,
+ region: 'us',
+ });
+ await store.runReadyHooks(); // framework detection
+ store.runInitHooks(); // health-check probe
+
+ const rec = new WizardRecorder(store, { program: 'posthog-integration' });
+ rec.start();
+ const driver = new WizardCiDriver(store);
+ const at = () => {
+ const s = driver.readState();
+ console.log(s.currentScreen, s.actions.map((a) => a.id));
+ return s.currentScreen;
+ };
+
+ // YOU decide each screen — read state, then commit a legal action:
+ at(); // intro
+ driver.performAction('confirm_setup');
+ at(); // health-check
+ driver.performAction('dismiss_outage');
+ // setup question? -> driver.performAction('choose', { key, value })
+
+ // the `run` screen = the real integration agent (blocks minutes):
+ await store.getGate('intro');
+ await store.getGate('health-check');
+ await runAgent(posthogIntegrationConfig, store.session);
+
+ // post-run screens:
+ driver.performAction('dismiss_outro');
+ driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
+ driver.performAction('dismiss_slack');
+ driver.performAction('keep_skills', { kept: false });
+
+ // SEE every key moment as the real TUI:
+ rec.stop();
+ for (const f of rec.getRecording().frames) {
+ console.log(`\n=== ${f.screen} ===\n` + renderFrame(f, Program.PostHogIntegration));
+ }
+}
+main();
+```
+
+`APP_DIR` is any directory — so for an **external repo**, clone it to `/tmp` and
+point `APP_DIR` at it.
-## 1. Drive it live over MCP (do this)
+## 2. Drive it as MCP tools (needs a fresh session)
-Register the `wizard-ci-mcp` server. It holds **one live `WizardStore`** for the
-app and exposes it, so you drive every decision yourself:
+`scripts/wizard-ci-mcp.no-jest.ts` is a stdio MCP server over one live store,
+exposing `read_state` / `perform_action` / `render_screen` / `run_agent` as tools
+you call turn-by-turn — the most interactive way. **But MCP tools load at session
+start**, so you cannot add-and-use it in the same session. Register it first, then
+drive in a **new** session:
```bash
-claude mcp add wizard-ci \
+claude mcp add -s project wizard-ci \
-e APP_DIR=/tmp/ \
-e POSTHOG_KEY_FILE= \
-e PROJECT_ID= \
- -e POSTHOG_REGION=us \
-- npx tsx "$WIZARD_PATH/scripts/wizard-ci-mcp.no-jest.ts"
```
-`APP_DIR` is any directory — so for an **external repo**, clone it to `/tmp` and
-point `APP_DIR` at it (this is how you explore an arbitrary app, not just the
-ones in `wizard-workbench/apps/`).
-
-Then drive it turn by turn with the tools:
-
-- **`read_state`** — current screen, run phase, secret-free session,
- tasks/status, pending question, and the actions legal right now. Call first
- and after each move.
-- **`perform_action {action, params?}`** — commit a decision: `confirm_setup`,
- `dismiss_outage`, `choose` (a setup question), `set_mcp_outcome`,
- `dismiss_slack`, `keep_skills`. The action must appear in
- `read_state.actions`.
-- **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
-- **`run_agent`** — on the `run` screen, run the **real integration agent**
- (blocks minutes); returns the final `runPhase` and next screen.
-
-A typical walk:
-
-```
-read_state → intro → perform_action confirm_setup
-read_state → health-check → perform_action dismiss_outage
-read_state → setup (if asked) → perform_action choose {key,value}
-read_state → run → run_agent (the real integration)
-read_state → outro → perform_action dismiss_outro
-read_state → mcp → perform_action set_mcp_outcome {outcome:"skipped"}
-read_state → slack-connect → perform_action dismiss_slack
-read_state → keep-skills → perform_action keep_skills {kept:false}
-```
-
-`render_screen` whenever you want to see what the user would. The token is
-redacted in `read_state` and `render_screen`, so anything you capture is safe to
-share.
+Then start a fresh Claude Code session in this repo and call the tools
+(`read_state` → `perform_action` → … → `run_agent` → … → `keep_skills`,
+`render_screen` to view).
-## 2. Or run it hands-off (scripted)
+## 3. Or run it hands-off (scripted)
-If you don't want to make the decisions, run the scripted profile end to end
-(for apps under `wizard-workbench/apps/`):
+To let the scripted profile make the decisions (for apps under
+`wizard-workbench/apps/`):
```bash
pnpm wizard-ci --e2e # real agent, headless; writes a recording
pnpm wizard-ci-snapshots # renders each key moment → .ans + report.html
```
-Replay it:
-`pnpm wizard-ci --replay /tmp/wizard-e2e-.recording.json --step`.
-
## Key facts
-- **State → screen.** You never navigate; you commit a decision (an action's
- store setter) and the router re-derives the active screen. Name actions, not
- keys.
-- **`run` is the only blocking step.** Everything else is an instant store
- commit; `run_agent` is the real, billable integration.
+- **State → screen.** You never navigate; you commit a decision (an action's store
+ setter) and the router re-derives the active screen. Name actions, not keys.
+- **`run` is the only blocking step.** Everything else is an instant store commit;
+ `run_agent` / `runAgent` is the real, billable integration.
- **A green run ≠ a valid integration.** `runPhase=completed` means the flow
- finished, not that the wizard understood the framework (e.g. it'll treat a
- Wasp app as react-router). Read what it actually changed.
+ finished, not that the wizard understood the framework (e.g. it'll treat a Wasp
+ app as react-router). Read what it actually changed.
- **None of this ships.** The harness lives in `e2e-harness/`, out of `src/`.
From 6dd01e536fa3939b923934fec7f2ce90b71065fd Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 19:00:53 -0400
Subject: [PATCH 26/38] fix(e2e-harness): make the MCP server actually
loadable; skill leads with it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Connect the stdio transport first and build the store lazily on the first tool
call — detection + the networked health probe used to run before connect(), which
could stall the MCP handshake so Claude Code saw the server as broken. Verified
end-to-end: `claude mcp add` → `claude mcp list` shows ✔ Connected → a headless
session drove read_state → perform_action(confirm_setup) → auth → render_screen.
Skill now leads with the two-phase MCP flow (register, then drive in a fresh
session, since MCP tools bind at session start); the driver script is the fallback.
---
.claude/skills/exploring-the-wizard/SKILL.md | 97 ++++++++-------
scripts/wizard-ci-mcp.no-jest.ts | 122 ++++++++++---------
2 files changed, 115 insertions(+), 104 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index a6d0c3dc..6f267b08 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -1,18 +1,17 @@
---
name: exploring-the-wizard
-description: Run, drive, and explore the PostHog wizard headlessly against an app — decide each screen yourself (read_state / perform_action / run_agent) and snapshot the TUI to view. Use when you want to test or explore the wizard end-to-end without a terminal.
+description: Run, drive, and explore the PostHog wizard headlessly against an app — decide each screen yourself over MCP (read_state / perform_action / run_agent) and snapshot the TUI to view. Use when you want to test or explore the wizard end-to-end without a terminal.
compatibility: Designed for Claude Code working on the PostHog wizard codebase.
metadata:
author: posthog
- version: "2.1"
+ version: "2.2"
---
# Exploring the wizard as an agent
Drive a real wizard run headlessly and decide each step yourself — read the
-current screen, commit a decision, fire the agent, snapshot the TUI. The control
-plane is `WizardCiDriver` (read/act over a live store); for _how_ it works, read
-[`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
+current screen, commit a decision, fire the agent, snapshot the TUI. For _how_ it
+works, read [`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
## 0. Ask for the key, set up
@@ -21,13 +20,51 @@ absolute path to your phx key file?" — plus the project id and region if you d
have them. Clone/point at the app as a **throwaway `/tmp` copy** (never a real
fixture). Note `WIZARD_PATH` (this repo). Never print or commit the key.
-## 1. Drive it from a script (works in THIS session)
+## 1. Drive it over MCP
-Write a script that drives the `WizardCiDriver` turn by turn — `readState()` →
-_your_ decision → `performAction()`, `renderFrame()` to see each screen. Put it
-**inside this repo** (so `@lib`/`@e2e-harness` resolve), name it
-`scripts/explore.no-jest.ts`, run `npx tsx scripts/explore.no-jest.ts`, then
-delete it. Run it, read the output, adjust the decisions, re-run.
+The `wizard-ci-mcp` server holds one live store and exposes it as tools. **MCP
+tools load at session start**, so this is two phases — register, then drive in a
+**fresh session**.
+
+**Phase 1 — register (in your current session):**
+
+```bash
+claude mcp add -s project wizard-ci \
+ -e APP_DIR=/tmp/ \
+ -e POSTHOG_KEY_FILE= \
+ -e PROJECT_ID= \
+ -e POSTHOG_REGION=us \
+ -- npx tsx "$WIZARD_PATH/scripts/wizard-ci-mcp.no-jest.ts"
+```
+
+Confirm it loads: `claude mcp list` shows `wizard-ci: … ✔ Connected`. Then tell the
+user to **start a fresh Claude Code session in this repo** (a new tab) — that's
+where the tools live. (`APP_DIR` is any dir, so an external repo works: clone it to
+`/tmp` and point `APP_DIR` at it.)
+
+**Phase 2 — drive (in the fresh session):** the `wizard-ci` tools are now bound.
+Walk the flow, deciding each screen:
+
+- **`read_state`** — current screen, run phase, secret-free session, tasks, the
+ actions legal now. Call first and after every move.
+- **`perform_action {action, params?}`** — `confirm_setup`, `dismiss_outage`,
+ `choose` (a setup question), `set_mcp_outcome`, `dismiss_slack`, `keep_skills`.
+- **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
+- **`run_agent`** — on the `run` screen, the **real integration** (blocks minutes).
+
+```
+read_state → intro → perform_action confirm_setup
+read_state → health-check → perform_action dismiss_outage
+read_state → run → run_agent (the real integration)
+read_state → outro → perform_action dismiss_outro → … → keep_skills
+```
+
+## 2. Drive it from a script (no fresh session needed)
+
+If you can't start a new session, drive the same `WizardCiDriver` from a script —
+`readState()` → your decision → `performAction()`, `renderFrame()` to view. Put it
+inside this repo (so `@lib`/`@e2e-harness` resolve) as `scripts/explore.no-jest.ts`,
+run `npx tsx scripts/explore.no-jest.ts`, then delete it.
```ts
import fs from 'fs';
@@ -52,37 +89,31 @@ async function main() {
projectId: process.env.PROJECT_ID!,
region: 'us',
});
- await store.runReadyHooks(); // framework detection
- store.runInitHooks(); // health-check probe
-
+ await store.runReadyHooks();
+ store.runInitHooks();
const rec = new WizardRecorder(store, { program: 'posthog-integration' });
rec.start();
const driver = new WizardCiDriver(store);
const at = () => {
const s = driver.readState();
console.log(s.currentScreen, s.actions.map((a) => a.id));
- return s.currentScreen;
};
- // YOU decide each screen — read state, then commit a legal action:
at(); // intro
driver.performAction('confirm_setup');
at(); // health-check
driver.performAction('dismiss_outage');
// setup question? -> driver.performAction('choose', { key, value })
- // the `run` screen = the real integration agent (blocks minutes):
await store.getGate('intro');
await store.getGate('health-check');
- await runAgent(posthogIntegrationConfig, store.session);
+ await runAgent(posthogIntegrationConfig, store.session); // the real integration
- // post-run screens:
driver.performAction('dismiss_outro');
driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
driver.performAction('dismiss_slack');
driver.performAction('keep_skills', { kept: false });
- // SEE every key moment as the real TUI:
rec.stop();
for (const f of rec.getRecording().frames) {
console.log(`\n=== ${f.screen} ===\n` + renderFrame(f, Program.PostHogIntegration));
@@ -91,33 +122,9 @@ async function main() {
main();
```
-`APP_DIR` is any directory — so for an **external repo**, clone it to `/tmp` and
-point `APP_DIR` at it.
-
-## 2. Drive it as MCP tools (needs a fresh session)
-
-`scripts/wizard-ci-mcp.no-jest.ts` is a stdio MCP server over one live store,
-exposing `read_state` / `perform_action` / `render_screen` / `run_agent` as tools
-you call turn-by-turn — the most interactive way. **But MCP tools load at session
-start**, so you cannot add-and-use it in the same session. Register it first, then
-drive in a **new** session:
-
-```bash
-claude mcp add -s project wizard-ci \
- -e APP_DIR=/tmp/ \
- -e POSTHOG_KEY_FILE= \
- -e PROJECT_ID= \
- -- npx tsx "$WIZARD_PATH/scripts/wizard-ci-mcp.no-jest.ts"
-```
-
-Then start a fresh Claude Code session in this repo and call the tools
-(`read_state` → `perform_action` → … → `run_agent` → … → `keep_skills`,
-`render_screen` to view).
-
## 3. Or run it hands-off (scripted)
-To let the scripted profile make the decisions (for apps under
-`wizard-workbench/apps/`):
+To let the scripted profile decide (for apps under `wizard-workbench/apps/`):
```bash
pnpm wizard-ci --e2e # real agent, headless; writes a recording
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index daef748d..41555e4d 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -1,15 +1,16 @@
/**
* wizard-ci-mcp — a standalone stdio MCP server that holds ONE live WizardStore
- * and exposes it, so an external agent (Claude Code) drives a real wizard run
- * turn by turn: read_state → perform_action → … → run_agent → … → keep_skills,
- * rendering the screen whenever it wants. Unlike e2e-full-run (which drives
- * itself via the scripted profile), here the connected agent makes every choice.
+ * and exposes it, so an external agent (Claude Code, in a session where this is
+ * registered) drives a real wizard run turn by turn: read_state → perform_action
+ * → … → run_agent → … → keep_skills, rendering the screen whenever it wants.
*
* APP_DIR=/tmp/app POSTHOG_KEY_FILE=/path/to/phx.txt PROJECT_ID=… \
* npx tsx scripts/wizard-ci-mcp.no-jest.ts # speaks MCP on stdio
*
- * Tools: read_state, perform_action, render_screen, run_agent.
- * stdout is the JSON-RPC channel — diagnostics go to stderr only.
+ * The transport connects immediately; the store (framework detection + the
+ * networked health probe) is built lazily on the first tool call, so the MCP
+ * handshake never blocks. Tools: read_state, perform_action, render_screen,
+ * run_agent. stdout is the JSON-RPC channel — diagnostics go to stderr only.
*/
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
@@ -35,9 +36,39 @@ const text = (data: unknown) => ({
],
});
+/** Render a store's current screen to ANSI (access token redacted). */
+function renderNow(store: WizardStore): string {
+ const s = store.session;
+ const session: WizardSession = s.credentials
+ ? {
+ ...s,
+ credentials: { ...s.credentials, accessToken: 'phx_***redacted***' },
+ }
+ : s;
+ const frame: RecordedFrame = {
+ seq: 0,
+ ms: 0,
+ triggers: ['screen'],
+ screen: store.currentScreen,
+ hasOverlay: store.router.hasOverlay,
+ session,
+ tasks: store.tasks.map((t) => ({
+ label: t.label,
+ status: t.status,
+ activeForm: t.activeForm,
+ done: t.done,
+ })),
+ statusMessages: [...store.statusMessages],
+ eventPlan: store.eventPlan.map((e) => ({
+ name: e.name,
+ description: e.description,
+ })),
+ };
+ return renderFrame(frame, Program.PostHogIntegration);
+}
+
async function main() {
- // The key can come inline or from a file path (keeps the secret out of the
- // MCP config the agent registers).
+ // Fast, offline validation — fail clearly before connecting if misconfigured.
const apiKey = (
process.env.POSTHOG_PERSONAL_API_KEY ??
(process.env.POSTHOG_KEY_FILE
@@ -53,49 +84,25 @@ async function main() {
if (!appDir || !fs.existsSync(appDir))
throw new Error(`APP_DIR missing or not found: ${appDir}`);
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store)); // real UI, never rendered → no stdout
- store.session = buildSession({
- installDir: appDir,
- ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
- apiKey,
- projectId,
- region,
- });
- await store.runReadyHooks(); // framework detection
- store.runInitHooks(); // health-check readiness probe
- const driver = new WizardCiDriver(store);
-
- /** Render the current screen to ANSI (access token redacted). */
- const renderNow = (): string => {
- const s = store.session;
- const session: WizardSession = s.credentials
- ? {
- ...s,
- credentials: { ...s.credentials, accessToken: 'phx_***redacted***' },
- }
- : s;
- const frame: RecordedFrame = {
- seq: 0,
- ms: 0,
- triggers: ['screen'],
- screen: store.currentScreen,
- hasOverlay: store.router.hasOverlay,
- session,
- tasks: store.tasks.map((t) => ({
- label: t.label,
- status: t.status,
- activeForm: t.activeForm,
- done: t.done,
- })),
- statusMessages: [...store.statusMessages],
- eventPlan: store.eventPlan.map((e) => ({
- name: e.name,
- description: e.description,
- })),
- };
- return renderFrame(frame, Program.PostHogIntegration);
- };
+ // Build the live store lazily on first use: detection + the networked health
+ // probe are slow, and doing them before connect() would stall the handshake.
+ let ready: Promise<{ store: WizardStore; driver: WizardCiDriver }> | null =
+ null;
+ const live = () =>
+ (ready ??= (async () => {
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store)); // never rendered → no stdout
+ store.session = buildSession({
+ installDir: appDir,
+ ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
+ apiKey,
+ projectId,
+ region,
+ });
+ await store.runReadyHooks(); // framework detection
+ store.runInitHooks(); // health-check readiness probe
+ return { store, driver: new WizardCiDriver(store) };
+ })());
const server = new McpServer({ name: 'wizard-ci', version: '1.0.0' });
@@ -103,7 +110,7 @@ async function main() {
'read_state',
"Read the wizard's committed state: current screen, run phase, a secret-free session view, agent tasks/status, any pending question, unresolved setup questions, and the actions legal right now. Call first and after every perform_action.",
{},
- async () => text(driver.readState()),
+ async () => text((await live()).driver.readState()),
);
server.tool(
@@ -118,7 +125,7 @@ async function main() {
},
async ({ action, params }) => {
try {
- return text(driver.performAction(action, params ?? {}));
+ return text((await live()).driver.performAction(action, params ?? {}));
} catch (e) {
return {
content: [
@@ -137,7 +144,7 @@ async function main() {
'render_screen',
'Render the current TUI screen to ANSI so you can see exactly what the user would.',
{},
- async () => text(renderNow()),
+ async () => text(renderNow((await live()).store)),
);
server.tool(
@@ -145,6 +152,7 @@ async function main() {
"Run the real wizard integration agent — the `run` screen's work. Blocks until it finishes (minutes), then returns the final runPhase and next screen. Call when read_state shows currentScreen=run.",
{},
async () => {
+ const { store } = await live();
await store.getGate('intro');
await store.getGate('health-check');
await runAgent(posthogIntegrationConfig, store.session);
@@ -155,12 +163,8 @@ async function main() {
},
);
- process.stderr.write(
- `wizard-ci-mcp: serving on stdio (app=${appDir}, detected=${
- store.session.integration ?? '?'
- })\n`,
- );
await server.connect(new StdioServerTransport());
+ process.stderr.write(`wizard-ci-mcp: ready on stdio (app=${appDir})\n`);
}
main().catch((e) => {
From f702da7c05c6fa9e96ab44fc8244e3699adba02e Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 19:37:28 -0400
Subject: [PATCH 27/38] feat(e2e-harness): bind wizard-ci as committed MCP
tools so an agent drives in one session
Register wizard-ci in .mcp.json so its tools are bound in every session in this
repo. An agent following the exploring-the-wizard skill now drives the wizard over
MCP (open_app -> read_state -> perform_action -> render_screen -> run_agent)
without registering anything or starting a fresh session. The server boots
app-agnostic; open_app picks the app + key at call time, so the committed config
holds no secrets. Skill + README rewritten to the one-session MCP flow.
Verified: a fresh headless agent given only the skill drove the wizard with four
MCP calls and wrote zero scripts.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
.claude/skills/exploring-the-wizard/SKILL.md | 146 +++----------
.mcp.json | 8 +
README.md | 26 ++-
scripts/wizard-ci-mcp.no-jest.ts | 203 +++++++++++++------
4 files changed, 195 insertions(+), 188 deletions(-)
create mode 100644 .mcp.json
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index 6f267b08..d2fcad5b 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -1,142 +1,60 @@
---
name: exploring-the-wizard
-description: Run, drive, and explore the PostHog wizard headlessly against an app — decide each screen yourself over MCP (read_state / perform_action / run_agent) and snapshot the TUI to view. Use when you want to test or explore the wizard end-to-end without a terminal.
+description: Run, drive, and explore the PostHog wizard headlessly against an app — boot it on the app and decide each screen yourself over the wizard-ci MCP tools (open_app / read_state / perform_action / run_agent), snapshotting the TUI to see what happened. Use to test or explore the wizard end-to-end.
compatibility: Designed for Claude Code working on the PostHog wizard codebase.
metadata:
author: posthog
- version: "2.2"
+ version: "3.0"
---
# Exploring the wizard as an agent
-Drive a real wizard run headlessly and decide each step yourself — read the
-current screen, commit a decision, fire the agent, snapshot the TUI. For _how_ it
-works, read [`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
+Drive a real wizard run yourself: boot it on an app, read each screen, decide, act,
+snapshot. You do this through the **`wizard-ci` MCP tools**, which are already bound
+in this repo (registered in `.mcp.json`). For _how_ it works underneath, read
+[`e2e-harness/ARCHITECTURE.md`](../../../e2e-harness/ARCHITECTURE.md).
-## 0. Ask for the key, set up
+If you don't see the `wizard-ci` tools (`open_app`, `read_state`, …), the server
+isn't approved yet — ask the user to approve `wizard-ci`, then retry.
-**First, ask the user for the path to their PostHog key file** — e.g. "What's the
-absolute path to your phx key file?" — plus the project id and region if you don't
-have them. Clone/point at the app as a **throwaway `/tmp` copy** (never a real
-fixture). Note `WIZARD_PATH` (this repo). Never print or commit the key.
+## Set up
-## 1. Drive it over MCP
+Ask the user for the absolute path to their PostHog key file — e.g. "What's the
+path to your phx key file?" — plus the project id and region if you don't have
+them. Clone or copy the target app to a **throwaway `/tmp` copy** (never a real
+fixture). Never print or commit the key.
-The `wizard-ci-mcp` server holds one live store and exposes it as tools. **MCP
-tools load at session start**, so this is two phases — register, then drive in a
-**fresh session**.
+## Drive
-**Phase 1 — register (in your current session):**
+1. **`open_app({ appDir, keyFile, projectId, region })`** — boots a live wizard on
+ the app and returns the first screen. Point `appDir` at the throwaway copy; for
+ a monorepo, the actual app dir (the one with `package.json`).
+2. **`read_state`** — current screen, run phase, secret-free session, tasks, and
+ the actions legal right now. Call after every move.
+3. **`perform_action({ action, params? })`** — commit a decision: `confirm_setup`,
+ `dismiss_outage`, `choose` (a setup question, e.g. `{ key, value }`),
+ `dismiss_outro`, `set_mcp_outcome`, `dismiss_slack`, `keep_skills`.
+4. **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
+5. **`run_agent`** — on the `run` screen, the **real integration** (blocks minutes).
-```bash
-claude mcp add -s project wizard-ci \
- -e APP_DIR=/tmp/ \
- -e POSTHOG_KEY_FILE= \
- -e PROJECT_ID= \
- -e POSTHOG_REGION=us \
- -- npx tsx "$WIZARD_PATH/scripts/wizard-ci-mcp.no-jest.ts"
-```
-
-Confirm it loads: `claude mcp list` shows `wizard-ci: … ✔ Connected`. Then tell the
-user to **start a fresh Claude Code session in this repo** (a new tab) — that's
-where the tools live. (`APP_DIR` is any dir, so an external repo works: clone it to
-`/tmp` and point `APP_DIR` at it.)
-
-**Phase 2 — drive (in the fresh session):** the `wizard-ci` tools are now bound.
-Walk the flow, deciding each screen:
-
-- **`read_state`** — current screen, run phase, secret-free session, tasks, the
- actions legal now. Call first and after every move.
-- **`perform_action {action, params?}`** — `confirm_setup`, `dismiss_outage`,
- `choose` (a setup question), `set_mcp_outcome`, `dismiss_slack`, `keep_skills`.
-- **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
-- **`run_agent`** — on the `run` screen, the **real integration** (blocks minutes).
+A typical walk:
```
-read_state → intro → perform_action confirm_setup
+open_app → intro → perform_action confirm_setup
read_state → health-check → perform_action dismiss_outage
read_state → run → run_agent (the real integration)
read_state → outro → perform_action dismiss_outro → … → keep_skills
```
-## 2. Drive it from a script (no fresh session needed)
-
-If you can't start a new session, drive the same `WizardCiDriver` from a script —
-`readState()` → your decision → `performAction()`, `renderFrame()` to view. Put it
-inside this repo (so `@lib`/`@e2e-harness` resolve) as `scripts/explore.no-jest.ts`,
-run `npx tsx scripts/explore.no-jest.ts`, then delete it.
-
-```ts
-import fs from 'fs';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession } from '@lib/wizard-session';
-import { Program } from '@lib/programs/program-registry';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
-import { runAgent } from '@lib/agent/agent-runner';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import { WizardRecorder } from '@e2e-harness/recorder';
-import { renderFrame } from '@e2e-harness/replay';
-
-async function main() {
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store));
- store.session = buildSession({
- installDir: process.env.APP_DIR!,
- ci: true,
- apiKey: fs.readFileSync(process.env.POSTHOG_KEY_FILE!, 'utf8').trim(),
- projectId: process.env.PROJECT_ID!,
- region: 'us',
- });
- await store.runReadyHooks();
- store.runInitHooks();
- const rec = new WizardRecorder(store, { program: 'posthog-integration' });
- rec.start();
- const driver = new WizardCiDriver(store);
- const at = () => {
- const s = driver.readState();
- console.log(s.currentScreen, s.actions.map((a) => a.id));
- };
-
- at(); // intro
- driver.performAction('confirm_setup');
- at(); // health-check
- driver.performAction('dismiss_outage');
- // setup question? -> driver.performAction('choose', { key, value })
-
- await store.getGate('intro');
- await store.getGate('health-check');
- await runAgent(posthogIntegrationConfig, store.session); // the real integration
-
- driver.performAction('dismiss_outro');
- driver.performAction('set_mcp_outcome', { outcome: 'skipped' });
- driver.performAction('dismiss_slack');
- driver.performAction('keep_skills', { kept: false });
-
- rec.stop();
- for (const f of rec.getRecording().frames) {
- console.log(`\n=== ${f.screen} ===\n` + renderFrame(f, Program.PostHogIntegration));
- }
-}
-main();
-```
-
-## 3. Or run it hands-off (scripted)
-
-To let the scripted profile decide (for apps under `wizard-workbench/apps/`):
-
-```bash
-pnpm wizard-ci --e2e # real agent, headless; writes a recording
-pnpm wizard-ci-snapshots # renders each key moment → .ans + report.html
-```
+Snapshot with `render_screen` at each key moment so you (and the user) can see what
+the wizard showed.
## Key facts
-- **State → screen.** You never navigate; you commit a decision (an action's store
- setter) and the router re-derives the active screen. Name actions, not keys.
-- **`run` is the only blocking step.** Everything else is an instant store commit;
- `run_agent` / `runAgent` is the real, billable integration.
+- **State → screen.** You never navigate; you commit a decision (an action) and the
+ router re-derives the active screen. Name actions, not keys.
+- **`run` is the only blocking step.** Everything else is an instant commit;
+ `run_agent` is the real, billable integration.
- **A green run ≠ a valid integration.** `runPhase=completed` means the flow
finished, not that the wizard understood the framework (e.g. it'll treat a Wasp
app as react-router). Read what it actually changed.
diff --git a/.mcp.json b/.mcp.json
new file mode 100644
index 00000000..6693fbb9
--- /dev/null
+++ b/.mcp.json
@@ -0,0 +1,8 @@
+{
+ "mcpServers": {
+ "wizard-ci": {
+ "command": "npx",
+ "args": ["tsx", "scripts/wizard-ci-mcp.no-jest.ts"]
+ }
+ }
+}
diff --git a/README.md b/README.md
index 1942c84f..5386d7f6 100644
--- a/README.md
+++ b/README.md
@@ -417,18 +417,24 @@ LLM calls. See the `e2e-tests/README.md` for more information.
#### Explore with an agent
-You can hand the wizard to an AI agent and have it run, drive, and explore the
-wizard itself — headlessly, snapshotting the TUI so it can see what happened. The
-how-to is the `exploring-the-wizard` skill
-(`.claude/skills/exploring-the-wizard/SKILL.md`), which an agent in this repo
-discovers automatically. Example prompt — explore against
+You can hand the wizard to an AI agent and have it drive the real flow itself —
+deciding each screen and snapshotting the TUI to see what happened. The agent
+drives through the `wizard-ci` MCP tools (`open_app` / `read_state` /
+`perform_action` / `render_screen` / `run_agent`), which are registered in this
+repo's `.mcp.json` and bound in every session here — approve `wizard-ci` the first
+time you're prompted. The how-to is the `exploring-the-wizard` skill
+(`.claude/skills/exploring-the-wizard/SKILL.md`), which an agent discovers
+automatically.
+
+Example prompt — explore against
[open-saas](https://github.com/wasp-lang/open-saas):
-> Explore the PostHog wizard against a real app. Use the `exploring-the-wizard`
-> skill — your runbook for driving the wizard headlessly and snapshotting it. Ask
-> me for my phx key file path, then clone `https://github.com/wasp-lang/open-saas`
-> into a throwaway `/tmp` copy, run the wizard against it driving the flow and
-> snapshotting each key moment, and tell me what it did and anything that broke.
+> Explore the PostHog wizard against open-saas, following the
+> `exploring-the-wizard` skill. Ask me for my phx key file path, clone
+> `https://github.com/wasp-lang/open-saas` into a throwaway `/tmp` copy, then use
+> the `wizard-ci` MCP tools to open it and drive the whole flow — deciding each
+> screen yourself and snapshotting key moments — and tell me what it did and
+> anything that broke.
## Publishing your tool
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 41555e4d..1b88eca3 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -1,16 +1,15 @@
/**
- * wizard-ci-mcp — a standalone stdio MCP server that holds ONE live WizardStore
- * and exposes it, so an external agent (Claude Code, in a session where this is
- * registered) drives a real wizard run turn by turn: read_state → perform_action
- * → … → run_agent → … → keep_skills, rendering the screen whenever it wants.
+ * wizard-ci-mcp — a stdio MCP server that holds one live WizardStore and exposes
+ * it as tools, so an agent drives a real wizard run turn by turn: open_app →
+ * read_state → perform_action → … → run_agent → … → keep_skills, rendering the
+ * screen whenever it wants.
*
- * APP_DIR=/tmp/app POSTHOG_KEY_FILE=/path/to/phx.txt PROJECT_ID=… \
- * npx tsx scripts/wizard-ci-mcp.no-jest.ts # speaks MCP on stdio
+ * Registered in this repo's `.mcp.json`, so the tools are bound in every session
+ * here — no per-run setup. It boots app-agnostic; `open_app` picks the app +
+ * credentials at call time (so nothing secret lives in `.mcp.json`). It also
+ * auto-opens from APP_DIR / POSTHOG_KEY_FILE env if those happen to be set.
*
- * The transport connects immediately; the store (framework detection + the
- * networked health probe) is built lazily on the first tool call, so the MCP
- * handshake never blocks. Tools: read_state, perform_action, render_screen,
- * run_agent. stdout is the JSON-RPC channel — diagnostics go to stderr only.
+ * stdout is the JSON-RPC channel — diagnostics go to stderr only.
*/
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
@@ -36,6 +35,16 @@ const text = (data: unknown) => ({
],
});
+const errorOut = (e: unknown) => ({
+ content: [
+ {
+ type: 'text' as const,
+ text: `Error: ${e instanceof Error ? e.message : String(e)}`,
+ },
+ ],
+ isError: true,
+});
+
/** Render a store's current screen to ANSI (access token redacted). */
function renderNow(store: WizardStore): string {
const s = store.session;
@@ -67,50 +76,112 @@ function renderNow(store: WizardStore): string {
return renderFrame(frame, Program.PostHogIntegration);
}
-async function main() {
- // Fast, offline validation — fail clearly before connecting if misconfigured.
- const apiKey = (
+type Live = { store: WizardStore; driver: WizardCiDriver };
+let active: Live | null = null;
+
+/** Boot a fresh live wizard on an app and make it the active run. */
+async function openApp(cfg: {
+ appDir: string;
+ apiKey: string;
+ projectId: string;
+ region: string;
+}): Promise<Live> {
+ if (!cfg.appDir || !fs.existsSync(cfg.appDir))
+ throw new Error(`appDir missing or not found: ${cfg.appDir}`);
+ if (!cfg.apiKey)
+ throw new Error('a PostHog key is required (keyFile or apiKey)');
+ const store = new WizardStore(Program.PostHogIntegration);
+ setUI(new InkUI(store)); // real UI, never rendered → no stdout
+ store.session = buildSession({
+ installDir: cfg.appDir,
+ ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
+ apiKey: cfg.apiKey,
+ projectId: cfg.projectId,
+ region: cfg.region,
+ });
+ await store.runReadyHooks(); // framework detection
+ store.runInitHooks(); // health-check readiness probe
+ active = { store, driver: new WizardCiDriver(store) };
+ return active;
+}
+
+/** The active run, auto-opening from env if it was provided at launch. */
+async function ensure(): Promise<Live> {
+ if (active) return active;
+ const envKey = (
process.env.POSTHOG_PERSONAL_API_KEY ??
(process.env.POSTHOG_KEY_FILE
? fs.readFileSync(process.env.POSTHOG_KEY_FILE, 'utf8')
: '')
).trim();
- const appDir = process.env.APP_DIR ?? '';
- const projectId =
- process.env.PROJECT_ID ?? process.env.POSTHOG_WIZARD_PROJECT_ID ?? '';
- const region = process.env.POSTHOG_REGION ?? 'us';
- if (!apiKey)
- throw new Error('POSTHOG_PERSONAL_API_KEY or POSTHOG_KEY_FILE required');
- if (!appDir || !fs.existsSync(appDir))
- throw new Error(`APP_DIR missing or not found: ${appDir}`);
-
- // Build the live store lazily on first use: detection + the networked health
- // probe are slow, and doing them before connect() would stall the handshake.
- let ready: Promise<{ store: WizardStore; driver: WizardCiDriver }> | null =
- null;
- const live = () =>
- (ready ??= (async () => {
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store)); // never rendered → no stdout
- store.session = buildSession({
- installDir: appDir,
- ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
- apiKey,
- projectId,
- region,
- });
- await store.runReadyHooks(); // framework detection
- store.runInitHooks(); // health-check readiness probe
- return { store, driver: new WizardCiDriver(store) };
- })());
+ if (process.env.APP_DIR && envKey)
+ return openApp({
+ appDir: process.env.APP_DIR,
+ apiKey: envKey,
+ projectId:
+ process.env.PROJECT_ID ?? process.env.POSTHOG_WIZARD_PROJECT_ID ?? '',
+ region: process.env.POSTHOG_REGION ?? 'us',
+ });
+ throw new Error(
+ 'No app open. Call open_app({ appDir, keyFile, projectId, region }) first.',
+ );
+}
+async function main() {
const server = new McpServer({ name: 'wizard-ci', version: '1.0.0' });
+ server.tool(
+ 'open_app',
+ 'Boot a live wizard run on an app and make it active. Call once before the other tools. Point appDir at a throwaway copy of the app (for a monorepo, the actual app dir with package.json). Returns the first screen.',
+ {
+ appDir: z
+ .string()
+ .describe('Absolute path to the app (a throwaway /tmp copy)'),
+ keyFile: z
+ .string()
+ .optional()
+ .describe(
+ 'Absolute path to a file holding the PostHog phx key (preferred)',
+ ),
+ apiKey: z
+ .string()
+ .optional()
+ .describe('The phx key inline (prefer keyFile to keep it out of logs)'),
+ projectId: z.string().describe('PostHog project id the key is scoped to'),
+ region: z
+ .enum(['us', 'eu'])
+ .optional()
+ .describe('PostHog region (default us)'),
+ },
+ async ({ appDir, keyFile, apiKey, projectId, region }) => {
+ try {
+ const key = (
+ apiKey ?? (keyFile ? fs.readFileSync(keyFile, 'utf8') : '')
+ ).trim();
+ const live = await openApp({
+ appDir,
+ apiKey: key,
+ projectId,
+ region: region ?? 'us',
+ });
+ return text(live.driver.readState());
+ } catch (e) {
+ return errorOut(e);
+ }
+ },
+ );
+
server.tool(
'read_state',
- "Read the wizard's committed state: current screen, run phase, a secret-free session view, agent tasks/status, any pending question, unresolved setup questions, and the actions legal right now. Call first and after every perform_action.",
+ "Read the wizard's committed state: current screen, run phase, a secret-free session view, agent tasks/status, any pending question, unresolved setup questions, and the actions legal right now. Call after every perform_action.",
{},
- async () => text((await live()).driver.readState()),
+ async () => {
+ try {
+ return text((await ensure()).driver.readState());
+ } catch (e) {
+ return errorOut(e);
+ }
+ },
);
server.tool(
@@ -121,21 +192,15 @@ async function main() {
params: z
.record(z.string(), z.unknown())
.optional()
- .describe('Action params, e.g. { key: "router", value: "app" }'),
+ .describe('Action params, e.g. { key: "router", value: "app-router" }'),
},
async ({ action, params }) => {
try {
- return text((await live()).driver.performAction(action, params ?? {}));
+ return text(
+ (await ensure()).driver.performAction(action, params ?? {}),
+ );
} catch (e) {
- return {
- content: [
- {
- type: 'text' as const,
- text: `Error: ${e instanceof Error ? e.message : String(e)}`,
- },
- ],
- isError: true,
- };
+ return errorOut(e);
}
},
);
@@ -144,7 +209,13 @@ async function main() {
'render_screen',
'Render the current TUI screen to ANSI so you can see exactly what the user would.',
{},
- async () => text(renderNow((await live()).store)),
+ async () => {
+ try {
+ return text(renderNow((await ensure()).store));
+ } catch (e) {
+ return errorOut(e);
+ }
+ },
);
server.tool(
@@ -152,19 +223,23 @@ async function main() {
"Run the real wizard integration agent — the `run` screen's work. Blocks until it finishes (minutes), then returns the final runPhase and next screen. Call when read_state shows currentScreen=run.",
{},
async () => {
- const { store } = await live();
- await store.getGate('intro');
- await store.getGate('health-check');
- await runAgent(posthogIntegrationConfig, store.session);
- return text({
- runPhase: store.session.runPhase,
- currentScreen: store.currentScreen,
- });
+ try {
+ const { store } = await ensure();
+ await store.getGate('intro');
+ await store.getGate('health-check');
+ await runAgent(posthogIntegrationConfig, store.session);
+ return text({
+ runPhase: store.session.runPhase,
+ currentScreen: store.currentScreen,
+ });
+ } catch (e) {
+ return errorOut(e);
+ }
},
);
await server.connect(new StdioServerTransport());
- process.stderr.write(`wizard-ci-mcp: ready on stdio (app=${appDir})\n`);
+ process.stderr.write('wizard-ci-mcp: ready on stdio\n');
}
main().catch((e) => {
From d50da8d455a863b5df13572fb7e3c944256cc9ea Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 19:49:54 -0400
Subject: [PATCH 28/38] docs(e2e-harness): drop "monorepo" wording from
open_app guidance
Just say to point appDir at the directory that has the package.json.
Co-Authored-By: Claude Opus 4.8
---
.claude/skills/exploring-the-wizard/SKILL.md | 4 ++--
scripts/wizard-ci-mcp.no-jest.ts | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index d2fcad5b..588dc342 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -27,8 +27,8 @@ fixture). Never print or commit the key.
## Drive
1. **`open_app({ appDir, keyFile, projectId, region })`** — boots a live wizard on
- the app and returns the first screen. Point `appDir` at the throwaway copy; for
- a monorepo, the actual app dir (the one with `package.json`).
+ the app and returns the first screen. Point `appDir` at the throwaway copy — the
+ app directory that has the `package.json`.
2. **`read_state`** — current screen, run phase, secret-free session, tasks, and
the actions legal right now. Call after every move.
3. **`perform_action({ action, params? })`** — commit a decision: `confirm_setup`,
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 1b88eca3..5ad89aa4 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -132,7 +132,7 @@ async function main() {
server.tool(
'open_app',
- 'Boot a live wizard run on an app and make it active. Call once before the other tools. Point appDir at a throwaway copy of the app (for a monorepo, the actual app dir with package.json). Returns the first screen.',
+ 'Boot a live wizard run on an app and make it active. Call once before the other tools. Point appDir at a throwaway copy of the app — the directory that has the package.json. Returns the first screen.',
{
appDir: z
.string()
From 332d9d67e7d9c3436c0643f9d5118bb752b389b5 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 19:51:43 -0400
Subject: [PATCH 29/38] docs(e2e-harness): drop the app-dir hand-holding from
open_app
appDir is just the throwaway copy of the app; let the agent find the path.
Co-Authored-By: Claude Opus 4.8
---
.claude/skills/exploring-the-wizard/SKILL.md | 3 +--
scripts/wizard-ci-mcp.no-jest.ts | 2 +-
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index 588dc342..5a72b374 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -27,8 +27,7 @@ fixture). Never print or commit the key.
## Drive
1. **`open_app({ appDir, keyFile, projectId, region })`** — boots a live wizard on
- the app and returns the first screen. Point `appDir` at the throwaway copy — the
- app directory that has the `package.json`.
+ the app and returns the first screen. `appDir` is the throwaway copy.
2. **`read_state`** — current screen, run phase, secret-free session, tasks, and
the actions legal right now. Call after every move.
3. **`perform_action({ action, params? })`** — commit a decision: `confirm_setup`,
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 5ad89aa4..7466e483 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -132,7 +132,7 @@ async function main() {
server.tool(
'open_app',
- 'Boot a live wizard run on an app and make it active. Call once before the other tools. Point appDir at a throwaway copy of the app — the directory that has the package.json. Returns the first screen.',
+ 'Boot a live wizard run on an app and make it active. Call once before the other tools. appDir is a throwaway copy of the app to integrate. Returns the first screen.',
{
appDir: z
.string()
From 6dde71072ea65fb6d08dd82e1d48660d7f98e573 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 20:00:06 -0400
Subject: [PATCH 30/38] fix(e2e-harness): point agents to run_agent on the auth
screen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
auth (and run) are NO_ACTION screens: session.credentials is set only inside
bootstrapProgram, which runs via run_agent. So nothing advances past auth without
run_agent — but the tool description said "call when currentScreen=run" and the
skill walk skipped auth, so an agent landed on auth and polled instead of calling
run_agent. Fix the run_agent description and the skill walk/key-facts to say
run_agent bootstraps creds and advances auth+run; don't poll those screens.
Co-Authored-By: Claude Opus 4.8
---
.claude/skills/exploring-the-wizard/SKILL.md | 10 ++++++----
scripts/wizard-ci-mcp.no-jest.ts | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index 5a72b374..4b4e7cbe 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -34,14 +34,15 @@ fixture). Never print or commit the key.
`dismiss_outage`, `choose` (a setup question, e.g. `{ key, value }`),
`set_mcp_outcome`, `dismiss_slack`, `keep_skills`.
4. **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
-5. **`run_agent`** — on the `run` screen, the **real integration** (blocks minutes).
+5. **`run_agent`** — bootstraps credentials from the key, then runs the **real
+ integration** (blocks minutes). This is what advances `auth` and `run`.
A typical walk:
```
open_app → intro → perform_action confirm_setup
read_state → health-check → perform_action dismiss_outage
-read_state → run → run_agent (the real integration)
+read_state → auth → run_agent (bootstraps creds + runs the integration)
read_state → outro → perform_action dismiss_outro → … → keep_skills
```
@@ -52,8 +53,9 @@ the wizard showed.
- **State → screen.** You never navigate; you commit a decision (an action) and the
router re-derives the active screen. Name actions, not keys.
-- **`run` is the only blocking step.** Everything else is an instant commit;
- `run_agent` is the real, billable integration.
+- **`auth` and `run` advance only via `run_agent`.** They expose no action and
+ don't self-advance — don't poll them. Everything else is an instant commit;
+ `run_agent` is the real, billable, blocking step.
- **A green run ≠ a valid integration.** `runPhase=completed` means the flow
finished, not that the wizard understood the framework (e.g. it'll treat a Wasp
app as react-router). Read what it actually changed.
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 7466e483..53adc0ec 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -220,7 +220,7 @@ async function main() {
server.tool(
'run_agent',
- "Run the real wizard integration agent — the `run` screen's work. Blocks until it finishes (minutes), then returns the final runPhase and next screen. Call when read_state shows currentScreen=run.",
+ 'Run the real wizard integration agent. It bootstraps credentials from the supplied key, then runs the integration, returning the final runPhase and next screen. This is what advances the auth and run screens — they expose no action and never advance on their own, so call this once setup is confirmed. Blocks minutes.',
{},
async () => {
try {
From 51b9f4ccdfb9ebccd66de99edd837167d0d2ee6c Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 20:27:21 -0400
Subject: [PATCH 31/38] fix(e2e-harness): make run_agent non-blocking so the
MCP server survives the run
A real run_agent call blocked the stdio MCP server for ~3 minutes; the client
treated the server as unhealthy, reconnected, and the restarted process lost its
in-memory store ("No app open", runPhase reset to idle). run_agent now starts the
integration in the background and returns immediately; read_state stays responsive
and reports runPhase running -> completed plus an integration status, so the agent
polls instead of blocking. Skill + tool descriptions updated to the poll model;
noted that run_agent creates real PostHog resources each run.
Proven: run_agent returns in 0.0s; read_state during the run answers in 1-2ms with
runPhase=running.
Co-Authored-By: Claude Opus 4.8
---
.claude/skills/exploring-the-wizard/SKILL.md | 18 ++++---
scripts/wizard-ci-mcp.no-jest.ts | 54 +++++++++++++++++---
2 files changed, 60 insertions(+), 12 deletions(-)
diff --git a/.claude/skills/exploring-the-wizard/SKILL.md b/.claude/skills/exploring-the-wizard/SKILL.md
index 4b4e7cbe..447c15a9 100644
--- a/.claude/skills/exploring-the-wizard/SKILL.md
+++ b/.claude/skills/exploring-the-wizard/SKILL.md
@@ -34,16 +34,19 @@ fixture). Never print or commit the key.
`dismiss_outage`, `choose` (a setup question, e.g. `{ key, value }`),
`set_mcp_outcome`, `dismiss_slack`, `keep_skills`.
4. **`render_screen`** — render the current TUI to ANSI so you can _see_ it.
-5. **`run_agent`** — bootstraps credentials from the key, then runs the **real
- integration** (blocks minutes). This is what advances `auth` and `run`.
+5. **`run_agent`** — kicks off the **real integration** in the background and
+ returns immediately; it bootstraps credentials, so it's what advances `auth`
+ and `run`. Then **poll `read_state`** — `runPhase` goes `running → completed`
+ and the screen advances to `outro`.
A typical walk:
```
open_app → intro → perform_action confirm_setup
read_state → health-check → perform_action dismiss_outage
-read_state → auth → run_agent (bootstraps creds + runs the integration)
-read_state → outro → perform_action dismiss_outro → … → keep_skills
+read_state → auth → run_agent (returns at once; integration runs in background)
+read_state (poll) → runPhase running → completed, screen → outro
+outro → perform_action dismiss_outro → … → keep_skills
```
Snapshot with `render_screen` at each key moment so you (and the user) can see what
@@ -54,8 +57,11 @@ the wizard showed.
- **State → screen.** You never navigate; you commit a decision (an action) and the
router re-derives the active screen. Name actions, not keys.
- **`auth` and `run` advance only via `run_agent`.** They expose no action and
- don't self-advance — don't poll them. Everything else is an instant commit;
- `run_agent` is the real, billable, blocking step.
+ don't self-advance. `run_agent` returns immediately and runs the integration in
+ the background — poll `read_state` for `runPhase` (`running → completed`).
+ Everything else is an instant commit.
+- **`run_agent` creates real PostHog resources** (a dashboard + insights) in the
+ project; each run duplicates them.
- **A green run ≠ a valid integration.** `runPhase=completed` means the flow
finished, not that the wizard understood the framework (e.g. it'll treat a Wasp
app as react-router). Read what it actually changed.
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 53adc0ec..4d2cd989 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -18,7 +18,11 @@ import fs from 'fs';
import { WizardStore } from '@ui/tui/store';
import { InkUI } from '@ui/tui/ink-ui';
import { setUI } from '@ui/index';
-import { buildSession, type WizardSession } from '@lib/wizard-session';
+import {
+ buildSession,
+ RunPhase,
+ type WizardSession,
+} from '@lib/wizard-session';
import { Program } from '@lib/programs/program-registry';
import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
import { runAgent } from '@lib/agent/agent-runner';
@@ -79,6 +83,11 @@ function renderNow(store: WizardStore): string {
type Live = { store: WizardStore; driver: WizardCiDriver };
let active: Live | null = null;
+// run_agent runs in the background so the tool returns immediately; a multi-minute
+// blocking MCP call makes the client reconnect and lose this process's store.
+let runStatus: 'idle' | 'running' | 'done' | 'failed' = 'idle';
+let runError: string | null = null;
+
/** Boot a fresh live wizard on an app and make it the active run. */
async function openApp(cfg: {
appDir: string;
@@ -177,7 +186,11 @@ async function main() {
{},
async () => {
try {
- return text((await ensure()).driver.readState());
+ const state = (await ensure()).driver.readState();
+ const extra: Record<string, unknown> = {};
+ if (runStatus !== 'idle') extra.integration = runStatus;
+ if (runError) extra.runError = runError;
+ return text({ ...state, ...extra });
} catch (e) {
return errorOut(e);
}
@@ -220,15 +233,44 @@ async function main() {
server.tool(
'run_agent',
- 'Run the real wizard integration agent. It bootstraps credentials from the supplied key, then runs the integration, returning the final runPhase and next screen. This is what advances the auth and run screens — they expose no action and never advance on their own, so call this once setup is confirmed. Blocks minutes.',
+ 'Kick off the real wizard integration and return immediately — do NOT expect it to block. It bootstraps credentials from the key and runs the integration in the background; this is what advances the auth and run screens (they never advance on their own). Then POLL read_state every ~10s: runPhase goes running → completed and currentScreen advances to outro. Takes minutes, and creates real PostHog resources (a dashboard + insights) in the project. Call once setup is confirmed.',
{},
async () => {
try {
const { store } = await ensure();
- await store.getGate('intro');
- await store.getGate('health-check');
- await runAgent(posthogIntegrationConfig, store.session);
+ if (runStatus === 'running')
+ return text({
+ status: 'already running — poll read_state',
+ runPhase: store.session.runPhase,
+ currentScreen: store.currentScreen,
+ });
+ if (
+ runStatus === 'done' ||
+ store.session.runPhase === RunPhase.Completed
+ )
+ return text({
+ status: 'already completed',
+ runPhase: store.session.runPhase,
+ currentScreen: store.currentScreen,
+ });
+ runStatus = 'running';
+ runError = null;
+ // Background: returning now keeps the server responsive so the client
+ // doesn't reconnect (which would drop this store). The agent polls.
+ void (async () => {
+ try {
+ await store.getGate('intro');
+ await store.getGate('health-check');
+ await runAgent(posthogIntegrationConfig, store.session);
+ runStatus = 'done';
+ } catch (e) {
+ runStatus = 'failed';
+ runError = e instanceof Error ? e.message : String(e);
+ }
+ })();
return text({
+ status:
+ 'integration started in the background — poll read_state every ~10s; runPhase goes running → completed and currentScreen advances to outro',
runPhase: store.session.runPhase,
currentScreen: store.currentScreen,
});
From 94ac289f658b4bf59b1d5da44fbff98380c55189 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 21:58:07 -0400
Subject: [PATCH 32/38] refactor(e2e-harness): drive + snapshot the real TUI;
one primitive for both routes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Both e2e routes run the real wizard TUI (startTUI) driven by store state
manipulation — no keystrokes — and capture the real rendered screen from a PTY.
Auth is satisfied by setCredentials with the phx key (same bearer as an OAuth
token), so the TUI advances with no browser.
- e2e-harness/tui-capture.ts — run a command in a PTY (node-pty), read its screen
via @xterm/headless.
- scripts/tui-host.no-jest.ts — the real-TUI host. MODE=fixed self-drives the
fixed e2e profile, signals each screen, writes a structured result JSON;
MODE=serve takes drive commands over a unix socket.
- scripts/tui-snapshots.no-jest.ts — CI route: real-TUI text snapshot per screen.
- scripts/wizard-ci-mcp.no-jest.ts — agent route: MCP server proxying the host.
- scripts/wizard-ci-explore.no-jest.ts — drive the MCP route, print the real TUI.
- scripts/tui-replay.no-jest.ts — replay captured snapshots in the terminal.
Deletes the record-then-reconstruct machinery (recorder, replay, e2e-full-run,
render-snapshots, replay-e2e) and the in-process wizard-ci-tools server. Adds
node-pty + @xterm/headless.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/ARCHITECTURE.md | 166 +++++----------
e2e-harness/__tests__/recorder.test.ts | 103 ---------
e2e-harness/action-registry.ts | 6 +-
e2e-harness/recorder.ts | 159 --------------
e2e-harness/replay.ts | 74 -------
e2e-harness/tui-capture.ts | 102 +++++++++
e2e-harness/wizard-ci-driver.ts | 4 +-
e2e-harness/wizard-ci-tools.ts | 108 ----------
package.json | 10 +-
pnpm-lock.yaml | 25 ++-
scripts/README.md | 32 +--
scripts/e2e-full-run.no-jest.ts | 280 -------------------------
scripts/render-snapshots.no-jest.ts | 32 ---
scripts/replay-e2e.no-jest.ts | 71 -------
scripts/tui-host.no-jest.ts | 257 +++++++++++++++++++++++
scripts/tui-replay.no-jest.ts | 59 ++++++
scripts/tui-snapshots.no-jest.ts | 60 ++++++
scripts/wizard-ci-explore.no-jest.ts | 105 ++++++++++
scripts/wizard-ci-mcp.no-jest.ts | 267 ++++++++++-------------
19 files changed, 800 insertions(+), 1120 deletions(-)
delete mode 100644 e2e-harness/__tests__/recorder.test.ts
delete mode 100644 e2e-harness/recorder.ts
delete mode 100644 e2e-harness/replay.ts
create mode 100644 e2e-harness/tui-capture.ts
delete mode 100644 e2e-harness/wizard-ci-tools.ts
delete mode 100644 scripts/e2e-full-run.no-jest.ts
delete mode 100644 scripts/render-snapshots.no-jest.ts
delete mode 100644 scripts/replay-e2e.no-jest.ts
create mode 100644 scripts/tui-host.no-jest.ts
create mode 100644 scripts/tui-replay.no-jest.ts
create mode 100644 scripts/tui-snapshots.no-jest.ts
create mode 100644 scripts/wizard-ci-explore.no-jest.ts
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index e13ec20e..0cf9c2e5 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -1,137 +1,87 @@
-# ci-driver — Headless e2e Control Plane
+# e2e-harness — Headless e2e Control Plane
-How an agent (or a script) drives a **real** wizard run end-to-end with no
-terminal and no browser, and asserts it worked. This is the control-plane path:
-it runs the WHOLE interactive flow headlessly via `wizard-ci-tools` and asserts
-on structured state — not the classic `--ci` mode (LoggingUI, stdout-grep,
-agent-only).
+How an agent (or CI) drives a **real** wizard run end-to-end — the **real TUI**,
+no browser, no keystrokes — and captures what it rendered. Both e2e routes share
+one idea: run the real `startTUI` (the real ink render) and drive its store by
+**state manipulation**, then capture the real rendered screen from a PTY.
-> If you're an agent that just wants to **run and explore the wizard** (drive
-> it, view the screens, snapshot it), use the `exploring-the-wizard` skill
+> If you're an agent that just wants to run and explore the wizard, use the
+> `exploring-the-wizard` skill
> ([`.claude/skills/exploring-the-wizard/SKILL.md`](../.claude/skills/exploring-the-wizard/SKILL.md)).
> This doc is the _how it works_ underneath.
## The pieces
-This whole harness lives in `e2e-harness/` at the repo root — deliberately OUT
-of `src/` so none of it is part of the wizard's production source (nothing in
-`src/` imports it; the tsdown bundle never includes it).
+This whole harness lives in `e2e-harness/` at the repo root — deliberately OUT of
+`src/` so none of it is part of the wizard's production source (nothing in `src/`
+imports it; the tsdown bundle never includes it).
```
e2e-harness/
- wizard-ci-driver.ts WizardCiDriver — read_state / list_actions / perform_action
+ wizard-ci-driver.ts WizardCiDriver — read_state / perform_action over the store
action-registry.ts screen → the actions legal on it (+ NO_ACTION_SCREENS)
- wizard-ci-tools.ts in-process MCP server exposing the driver to an external loop
e2e-profile.ts WizardE2eProfile + decideE2eAction — the scripted walk policy
profiles.ts per-program profiles + profileFor(programId)
- recorder.ts captures a run as key-moment frames
- replay.ts reconstructs a frame's store and renders the real Ink screen
+ tui-capture.ts run a command in a PTY (node-pty) + read its real screen (@xterm/headless)
+scripts/
+ tui-host.no-jest.ts the real-TUI host: startTUI + WizardCiDriver, MODE=fixed | serve
+ tui-snapshots.no-jest.ts CI route: host(fixed) in a PTY → per-screen real-TUI snapshots
+ wizard-ci-mcp.no-jest.ts agent route: MCP server proxying host(serve)
```
-The driver reads and mutates the **real** `WizardStore`: the router resolves the
-active screen from session state, every action goes through a store setter, and
-the render is a pure projection of that state. So driving the store headlessly
-exercises exactly the code an interactive run would.
+The driver reads and mutates the **real** `WizardStore` that the TUI renders from:
+the router resolves the active screen from session state, every action goes
+through a store setter, and the render is a pure projection of that state. So
+manipulating the store makes the real TUI react — the driver and the renderer
+share one store and never conflict; you never touch the TUI's input.
-## Driving a run
+## Auth without a browser
-A headless run wires a real `WizardStore` + `InkUI` (never rendered), a
-concurrent `WizardCiDriver`, and the real `runAgent` against the gateway. The
-loop is:
+The real TUI runs `ci: true`, and auth is satisfied by **state manipulation**:
+`getOrAskForProjectData({ ci: true, apiKey })` resolves the phx personal key into
+credentials, and `store.setCredentials(...)` sets them — the same bearer path an
+OAuth token takes, so the auth screen advances with no browser and no keystrokes.
+(`run_agent` does the same bootstrap as part of the real integration.)
-```
-read_state → decideE2eAction(state, profile) → perform_action → repeat
-```
-
-`scripts/e2e-full-run.no-jest.ts` is the runnable harness; the
-[wizard-workbench](https://github.com/PostHog/wizard-workbench)
-`wizard-ci --e2e` command orchestrates it (copies the app to a scratch dir,
-strips the host env, asserts on the result). Run shape:
+## The two routes
-```bash
-POSTHOG_PERSONAL_API_KEY=… POSTHOG_REGION=us \
- npx tsx scripts/e2e-full-run.no-jest.ts # APP_DIR, PROJECT_ID via env
-```
+- **CI snapshots** — `tui-snapshots.no-jest.ts` spawns `tui-host` (`MODE=fixed`)
+ in a PTY. The host self-drives the fixed profile (`decideE2eAction`) and signals
+ each new screen; the parent writes the real rendered screen to
+ `SNAP_OUT/NN-<screen>.txt`. `RUN_AGENT=1` runs the real agent through to outro.
+- **Agent** — `wizard-ci-mcp.no-jest.ts` is a stdio MCP server that spawns
+ `tui-host` (`MODE=serve`) and proxies: `read_state` / `perform_action` /
+ `run_agent` forward over a unix socket; `render_screen` returns the real
+ captured frame. The agent decides each screen itself.
-### Four things that bite
+## Things that bite
1. **Running inside an agent session.** Host env (`CLAUDECODE`, `ANTHROPIC_*`,
`CLAUDE_CODE_*`) makes the wizard's spawned agent defer auth to the host →
- `apiKeySource: none` → 401. The harness strips these for the child; if you
- invoke it directly, strip them yourself. A plain CI shell never has them.
-2. **A project-scoped key needs its project id.** A personal key scoped to one
- team must be given that team's `--project-id` (or
- `POSTHOG_WIZARD_PROJECT_ID`), or bootstrap 403s on project-data fetch. The
- key still authenticates — it just isn't scoped to the default team.
-3. **Never run on a real fixture.** Always a throwaway copy; the harness does
- this.
-4. **Runs are sequential and minutes long** (~3–8 min, gateway round-trips
- dominate). The agent log is one shared file — never run two at once.
-
-## Reading the result
-
-The harness emits a JSON result; assert on:
-
-| field | pass when |
-| --------------------------- | ------------------------------------------------ |
-| `runPhase` | `"completed"` (the agent finished) |
-| `hasPosthogDep` / `envFile` | a posthog dep was added and/or a `.env*` written |
-| `screenPath` | includes `keep-skills` (full flow walked) |
-| `skillsComplete` | `true` (run reached its done-signal) |
-| `skillsDeleted` | `true` when policy = delete |
+ `apiKeySource: none` → 401. The harness strips these for the child. A plain CI
+ shell never has them.
+2. **A project-scoped key needs its project id.** Pass the team's `--project-id`
+ (or `POSTHOG_WIZARD_PROJECT_ID`), or bootstrap 403s on project-data fetch.
+3. **Never run on a real fixture.** Always a throwaway copy.
+4. **`run_agent` is minutes long and creates real resources** (a dashboard +
+ insights) each run; the agent log is one shared file — never run two at once.
+5. **node-pty's spawn-helper.** When the package is extracted without running its
+ build script (pnpm skips it), the prebuilt `spawn-helper` loses its execute
+ bit and `pty.spawn` fails with `posix_spawnp failed`. `tui-capture.ts` restores
+ it best-effort on each spawn.
## Changing what the run does
-The per-program UI choices are product knowledge, but they live in the harness
-(`profiles.ts`, keyed by program id) — not on the program config — so this
-machinery stays out of the wizard's production source. Edit the program's entry
-in `profiles.ts` (typed by `WizardE2eProfile`). The harness asks
-`decideE2eAction(state, profile)` what to commit on each screen. To make another
-program e2e-drivable, add its profile to `profiles.ts`.
-
-The flow is **snapshot-tested** offline (no agent, deterministic):
-`__tests__/e2e-flow-snapshot.test.ts` golden-checks the (screen → decision)
-trace. Update with `jest -u` after an intentional flow/profile change.
-
-## Record & replay
-
-Every run is recorded as a timeline of **key-moment frames** — one each time the
-store/router changes (a route, a task-list update, a status line, a runPhase
-change, an overlay). Replay reconstructs each frame's store and renders the real
-Ink screen back to the terminal, so a run can be watched back to verify it:
-
-A real `--e2e` run drops a recording at `/tmp/wizard-e2e-.recording.json`.
-
-```bash
-npx tsx scripts/replay-e2e.no-jest.ts --step # Enter ▸ step
-npx tsx scripts/replay-e2e.no-jest.ts --delay 1200 # auto-play
-```
-
-An agent that can't sit in the stepper can instead read the recording JSON
-directly (each frame has `triggers`, `screen`, `tasks`, `statusMessages`,
-redacted `session`) or render specific frames to ANSI with `renderFrame()` from
-`replay.ts`. The access token is redacted, so recordings are safe to share.
-Code: `recorder.ts` (capture) + `replay.ts` (render).
+Per-program UI choices live in the harness (`profiles.ts`, keyed by program id) —
+not on the program config — so this machinery stays out of production source. Edit
+the program's entry (typed by `WizardE2eProfile`); the host asks
+`decideE2eAction(state, profile)` what to commit on each screen. The (screen →
+decision) trace is snapshot-tested offline in `__tests__/` (`jest -u` to update).
## Visual-regression snapshots (the workbench flow)
-[wizard-workbench](https://github.com/PostHog/wizard-workbench) drives this for
-real-run **visual regression**: `pnpm wizard-ci-snapshots` runs each CI-e2e test
-definition as a real `--e2e` run, renders every key-moment frame to a `.ans`
-snapshot (via `scripts/render-snapshots.no-jest.ts` → `replay.ts`), and diffs
-against a committed baseline, writing a side-by-side `report.html`. Run-to-run
-agent differences (e.g. a different task enqueued) are surfaced for a human to
-review, not asserted away. It needs `WIZARD_PATH` pointing at a checkout that
-has this `e2e-harness/`, plus the e2e env (`POSTHOG_PERSONAL_API_KEY`,
-`POSTHOG_WIZARD_PROJECT_ID`, `POSTHOG_REGION`). See
-`services/wizard-ci/snapshots.ts` there.
-
-## Driving it as a true LLM loop (optional)
-
-`wizard-ci-tools.ts` exposes `read_state` / `list_actions` / `perform_action` as
-an in-process MCP server. To have a model (not a scripted profile) play the
-user, connect a driver model and loop `read_state → reason → perform_action`.
-For deterministic CI prefer the scripted profile; reserve the LLM loop for
-fuzzing the flow. Note: a multi-turn driver must route through the wizard's real
-agent initialization for gateway auth — a bare `query()` 401s on the follow-up
-turn.
+[wizard-workbench](https://github.com/PostHog/wizard-workbench) runs the CI route
+for real-run visual regression: each test definition runs `tui-snapshots`, the
+real-TUI screens are rasterized to a side-by-side baseline-vs-current review, and
+run-to-run differences are surfaced for a human, not asserted away. See
+`services/wizard-ci/` there.
diff --git a/e2e-harness/__tests__/recorder.test.ts b/e2e-harness/__tests__/recorder.test.ts
deleted file mode 100644
index 747f7d60..00000000
--- a/e2e-harness/__tests__/recorder.test.ts
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Recorder unit test: the key-moment capture logic. (Frame *rendering* is
- * validated via tsx — jest globally mocks `ink` — see replay.ts.)
- */
-
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession, RunPhase } from '@lib/wizard-session';
-import { Integration } from '@lib/constants';
-import { FRAMEWORK_REGISTRY } from '@lib/registry';
-import { WizardReadiness } from '@lib/health-checks/readiness';
-import { Program } from '@lib/programs/program-registry';
-import { ScreenId } from '@ui/tui/router';
-import { WizardCiDriver } from '../wizard-ci-driver';
-import { decideE2eAction } from '../e2e-profile';
-import { profileFor } from '../profiles';
-import { WizardRecorder } from '../recorder';
-
-function recordedRun() {
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store));
- const session = buildSession({ installDir: '/tmp/rec', ci: true });
- session.integration = Integration.nextjs;
- session.frameworkConfig = FRAMEWORK_REGISTRY[Integration.nextjs];
- store.session = session;
-
- let clock = 0;
- const rec = new WizardRecorder(
- store,
- { program: 'posthog-integration', app: 'demo' },
- () => (clock += 500),
- );
- rec.start();
-
- const driver = new WizardCiDriver(store);
- const profile = profileFor(Program.PostHogIntegration);
- for (let i = 0; i < 40; i++) {
- const state = driver.readState();
- const d = decideE2eAction(state, profile);
- if (d.action) driver.performAction(d.action.id, d.action.params ?? {});
- if (state.currentScreen === ScreenId.HealthCheck) {
- store.setReadinessResult({
- decision: WizardReadiness.Yes,
- health: {} as never,
- reasons: [],
- });
- } else if (state.currentScreen === ScreenId.Auth) {
- store.setCredentials({
- accessToken: 'phx_topsecret',
- projectApiKey: 'phc_x',
- host: 'https://us.posthog.com',
- projectId: 1,
- });
- } else if (state.currentScreen === ScreenId.Run) {
- store.pushStatus('Installing posthog-js…');
- store.setTasks([
- { label: 'Install SDK', status: 'completed' as never, done: true },
- ]);
- store.setRunPhase(RunPhase.Completed);
- }
- if (d.done || store.session.skillsComplete) break;
- }
- rec.stop();
- return rec.getRecording();
-}
-
-describe('WizardRecorder', () => {
- it('captures a frame at each key moment, labelled by trigger', () => {
- const rec = recordedRun();
- const triggers = rec.frames.map((f) => f.triggers.join('+'));
-
- // First frame is the initial snapshot; every route lands a 'screen' frame.
- expect(triggers[0]).toBe('start');
- expect(rec.frames.map((f) => f.screen)).toEqual(
- expect.arrayContaining([
- ScreenId.Intro,
- ScreenId.HealthCheck,
- ScreenId.Setup,
- ScreenId.Auth,
- ScreenId.Run,
- ScreenId.Outro,
- ScreenId.Mcp,
- ScreenId.SlackConnect,
- ScreenId.KeepSkills,
- ]),
- );
- // Task + status updates during the run are their own key moments.
- expect(triggers).toContain('tasks');
- expect(triggers).toContain('status');
- // Frames carry monotonic timestamps.
- expect(rec.frames.map((f) => f.ms)).toEqual(
- [...rec.frames.map((f) => f.ms)].sort((a, b) => a - b),
- );
- });
-
- it('redacts the access token from the recording', () => {
- const rec = recordedRun();
- expect(JSON.stringify(rec)).not.toContain('phx_topsecret');
- const authed = rec.frames.find((f) => f.session.credentials);
- expect(authed?.session.credentials?.accessToken).toBe('phx_***redacted***');
- });
-});
diff --git a/e2e-harness/action-registry.ts b/e2e-harness/action-registry.ts
index cde6315a..13f274e4 100644
--- a/e2e-harness/action-registry.ts
+++ b/e2e-harness/action-registry.ts
@@ -3,9 +3,9 @@
*
* Maps every screen/overlay to the set of *commit* actions a user could
* perform on it — and, for each, the single WizardStore setter/resolver that
- * commit goes through. This is the actuation half of the wizard-ci-tools
- * surface: instead of injecting keystrokes, a harness names an action and the
- * driver invokes the same store method the Ink screen's keyboard handler would.
+ * commit goes through. This is the actuation half of the driver: instead of
+ * injecting keystrokes, a harness names an action and the driver invokes the
+ * same store method the Ink screen's keyboard handler would.
*
* Discipline mirrors screen-registry.tsx: one entry per screen, kept exhaustive
* by a test over the ScreenId/Overlay enums. No product knowledge leaks in —
diff --git a/e2e-harness/recorder.ts b/e2e-harness/recorder.ts
deleted file mode 100644
index 6e65c190..00000000
--- a/e2e-harness/recorder.ts
+++ /dev/null
@@ -1,159 +0,0 @@
-/**
- * WizardRecorder — records a wizard run as a timeline of frames, one per "key
- * moment", so the run can be replayed in the terminal later (by an agent or a
- * human) to verify what happened.
- *
- * Key moments are store/router changes: a route (screen) change, a runPhase
- * change, a task-list update, a new status line, an event-plan update, or an
- * overlay push/pop. The recorder subscribes to the store's single version
- * counter (the same signal React uses) and snapshots state whenever one of
- * those changes — so the recording mirrors exactly what the live TUI would have
- * repainted on.
- *
- * Each frame stores the (secret-redacted) session plus tasks/status/event-plan,
- * which is enough for {@link ../replay} to reconstruct a throwaway store and
- * render the real Ink screen back to ANSI.
- */
-
-import type { WizardStore } from '@ui/tui/store';
-import type { ScreenName } from '@ui/tui/router';
-import type { WizardSession } from '@lib/wizard-session';
-
-/** The change(s) that triggered a frame. */
-export type FrameTrigger =
- | 'start'
- | 'screen'
- | 'runPhase'
- | 'tasks'
- | 'status'
- | 'eventPlan'
- | 'overlay';
-
-export interface RecordedFrame {
- seq: number;
- /** ms since the recording started. */
- ms: number;
- /** Which key moment(s) produced this frame. */
- triggers: FrameTrigger[];
- screen: ScreenName;
- hasOverlay: boolean;
- /** Session snapshot with the access token redacted. */
- session: WizardSession;
- tasks: Array<{
- label: string;
- status: string;
- activeForm?: string;
- done: boolean;
- }>;
- statusMessages: string[];
- eventPlan: Array<{ name: string; description: string }>;
-}
-
-export interface Recording {
- meta: { program: string; app?: string; startedAtMs: number };
- frames: RecordedFrame[];
-}
-
-/** Redact the access token — recordings are shareable artifacts. */
-function redactSession(session: WizardSession): WizardSession {
- if (!session.credentials) return session;
- return {
- ...session,
- credentials: { ...session.credentials, accessToken: 'phx_***redacted***' },
- };
-}
-
-export class WizardRecorder {
- private frames: RecordedFrame[] = [];
- private seq = 0;
- private startMs: number;
- private unsub: (() => void) | null = null;
-
- private prevScreen: ScreenName;
- private prevRunPhase: WizardSession['runPhase'];
- private prevTasks: unknown;
- private prevStatus: unknown;
- private prevEventPlan: unknown;
- private prevOverlay: boolean;
-
- constructor(
- private readonly store: WizardStore,
- private readonly meta: { program: string; app?: string },
- private readonly now: () => number = () => Date.now(),
- ) {
- this.startMs = this.now();
- this.prevScreen = store.currentScreen;
- this.prevRunPhase = store.session.runPhase;
- this.prevTasks = store.tasks;
- this.prevStatus = store.statusMessages;
- this.prevEventPlan = store.eventPlan;
- this.prevOverlay = store.router.hasOverlay;
- }
-
- /** Begin recording: snapshot the initial frame and subscribe to changes. */
- start(): void {
- this.capture(['start']);
- this.unsub = this.store.subscribe(() => this.onChange());
- }
-
- /** Stop recording. */
- stop(): void {
- this.unsub?.();
- this.unsub = null;
- }
-
- private onChange(): void {
- const s = this.store;
- const triggers: FrameTrigger[] = [];
- // The store replaces these by reference on every mutation, so identity
- // comparison detects each kind of key moment.
- if (s.currentScreen !== this.prevScreen) triggers.push('screen');
- if (s.session.runPhase !== this.prevRunPhase) triggers.push('runPhase');
- if (s.tasks !== this.prevTasks) triggers.push('tasks');
- if (s.statusMessages !== this.prevStatus) triggers.push('status');
- if (s.eventPlan !== this.prevEventPlan) triggers.push('eventPlan');
- if (s.router.hasOverlay !== this.prevOverlay) triggers.push('overlay');
-
- this.prevScreen = s.currentScreen;
- this.prevRunPhase = s.session.runPhase;
- this.prevTasks = s.tasks;
- this.prevStatus = s.statusMessages;
- this.prevEventPlan = s.eventPlan;
- this.prevOverlay = s.router.hasOverlay;
-
- if (triggers.length > 0) this.capture(triggers);
- }
-
- private capture(triggers: FrameTrigger[]): void {
- this.frames.push({
- seq: this.seq++,
- ms: this.now() - this.startMs,
- triggers,
- screen: this.store.currentScreen,
- hasOverlay: this.store.router.hasOverlay,
- session: redactSession(this.store.session),
- tasks: this.store.tasks.map((t) => ({
- label: t.label,
- status: t.status,
- activeForm: t.activeForm,
- done: t.done,
- })),
- statusMessages: [...this.store.statusMessages],
- eventPlan: this.store.eventPlan.map((e) => ({
- name: e.name,
- description: e.description,
- })),
- });
- }
-
- getRecording(): Recording {
- return {
- meta: { ...this.meta, startedAtMs: this.startMs },
- frames: this.frames,
- };
- }
-
- get frameCount(): number {
- return this.frames.length;
- }
-}
diff --git a/e2e-harness/replay.ts b/e2e-harness/replay.ts
deleted file mode 100644
index d5729413..00000000
--- a/e2e-harness/replay.ts
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Replay a {@link Recording} in the terminal: for each recorded frame,
- * reconstruct a throwaway store from the frame's state and render the REAL Ink
- * screen back to ANSI — so an agent or a human sees the run play back exactly as
- * the live TUI drew it, paused at each key moment.
- *
- * Rendering is offline against a disposable store, so screen effects (detection,
- * prefetch) fire harmlessly against the recorded state and never touch the real
- * run.
- */
-
-import { readFileSync } from 'fs';
-import type { ReactElement } from 'react';
-import { render } from 'ink-testing-library';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { TaskStatus } from '@ui/wizard-ui';
-import type { ProgramId } from '@ui/tui/router';
-import { createScreens, createServices } from '@ui/tui/screen-registry';
-import type { Recording, RecordedFrame } from './recorder.js';
-
-export function loadRecording(path: string): Recording {
- return JSON.parse(readFileSync(path, 'utf8')) as Recording;
-}
-
-/**
- * Render one frame's screen to an ANSI string by rebuilding a disposable store
- * from the frame and mounting the real screen component. Falls back to a text
- * summary if a screen throws on render.
- */
-export function renderFrame(frame: RecordedFrame, program: ProgramId): string {
- const store = new WizardStore(program);
- setUI(new InkUI(store));
- store.session = frame.session;
- store.setTasks(
- frame.tasks.map((t) => ({
- label: t.label,
- activeForm: t.activeForm,
- status: (t.status as TaskStatus) ?? TaskStatus.Pending,
- done: t.done,
- })),
- );
- if (frame.eventPlan.length > 0) store.setEventPlan(frame.eventPlan);
- for (const m of frame.statusMessages) store.pushStatus(m);
-
- try {
- const services = createServices(store);
- const screens = createScreens(store, services);
- const node = screens[frame.screen];
- if (!node) return `(no component registered for screen "${frame.screen}")`;
- const { lastFrame, unmount } = render(node as ReactElement);
- const out = lastFrame() ?? '';
- unmount();
- return out;
- } catch (err) {
- return `(render failed: ${
- err instanceof Error ? err.message : String(err)
- })`;
- }
-}
-
-/** One-line header summarizing a frame. */
-export function frameHeader(frame: RecordedFrame, total: number): string {
- const secs = (frame.ms / 1000).toFixed(1);
- const tasks = frame.tasks.length
- ? ` · tasks ${frame.tasks.filter((t) => t.done).length}/${
- frame.tasks.length
- }`
- : '';
- return `── [${frame.seq + 1}/${total}] +${secs}s · ${frame.triggers.join(
- '+',
- )} · screen=${frame.screen}${tasks} ──`;
-}
diff --git a/e2e-harness/tui-capture.ts b/e2e-harness/tui-capture.ts
new file mode 100644
index 00000000..d4bbdcfa
--- /dev/null
+++ b/e2e-harness/tui-capture.ts
@@ -0,0 +1,102 @@
+/**
+ * Run a command in a PTY and read its real terminal screen.
+ *
+ * The shared capture primitive for both e2e routes: spawn the real-TUI host in a
+ * pseudo-terminal (node-pty) so it renders the real ink TUI, feed its output to a
+ * headless xterm emulator, and read the current screen as clean text on demand.
+ */
+import fsmod from 'fs';
+import pathmod from 'path';
+import * as pty from 'node-pty';
+import { createRequire } from 'module';
+
+// @xterm/headless ships CJS; its `module` field points at the full browser build,
+// so import the headless CJS entry directly to get a working Terminal in Node.
+const require = createRequire(import.meta.url);
+const { Terminal } =
+ require('@xterm/headless') as typeof import('@xterm/headless');
+
+// node-pty's prebuilt macOS/Linux spawn-helper can lose its execute bit when the
+// package is extracted without running its build script (e.g. pnpm skips it),
+// which makes pty.spawn fail with "posix_spawnp failed". Restore it, best-effort.
+function ensureSpawnHelper(): void {
+ try {
+ const root = pathmod.dirname(require.resolve('node-pty/package.json'));
+ const dir = pathmod.join(
+ root,
+ 'prebuilds',
+ `${process.platform}-${process.arch}`,
+ );
+ const helper = pathmod.join(dir, 'spawn-helper');
+ if (fsmod.existsSync(helper)) fsmod.chmodSync(helper, 0o755);
+ } catch {
+ /* best-effort */
+ }
+}
+
+export interface TuiCapture {
+ /** The current rendered screen as clean text (trailing blank lines trimmed). */
+ frame(): string;
+ /** Fires after each chunk of terminal output is applied. */
+ onData(cb: () => void): void;
+ kill(): void;
+ /** Resolves when the child exits. */
+ exited: Promise<void>;
+}
+
+export function captureTui(opts: {
+ cmd: string;
+ args: string[];
+ cwd: string;
+ env: NodeJS.ProcessEnv;
+ cols?: number;
+ rows?: number;
+}): TuiCapture {
+ // Default to a roomy, full-screen-terminal-ish size (overridable per call or
+ // via PTY_COLS / PTY_ROWS) so the TUI renders the way it would on a real Mac
+ // terminal rather than cramped. The PTY winsize drives ink's layout.
+ const cols = opts.cols ?? (Number(process.env.PTY_COLS) || 180);
+ const rows = opts.rows ?? (Number(process.env.PTY_ROWS) || 50);
+ ensureSpawnHelper();
+ const term = new Terminal({ cols, rows, allowProposedApi: true });
+ const child = pty.spawn(opts.cmd, opts.args, {
+ name: 'xterm-256color',
+ cols,
+ rows,
+ cwd: opts.cwd,
+ env: opts.env as { [key: string]: string },
+ });
+
+ const cbs: Array<() => void> = [];
+ child.onData((d) => {
+ term.write(d);
+ for (const cb of cbs) cb();
+ });
+ let resolveExit!: () => void;
+ const exited = new Promise<void>((r) => (resolveExit = r));
+ child.onExit(() => resolveExit());
+
+ return {
+ frame() {
+ const buf = term.buffer.active;
+ const lines: string[] = [];
+ for (let i = 0; i < rows; i++) {
+ const line = buf.getLine(i);
+ lines.push(line ? line.translateToString(true) : '');
+ }
+ while (lines.length && !lines[lines.length - 1].trim()) lines.pop();
+ return lines.join('\n') + '\n';
+ },
+ onData(cb) {
+ cbs.push(cb);
+ },
+ kill() {
+ try {
+ child.kill();
+ } catch {
+ /* already gone */
+ }
+ },
+ exited,
+ };
+}
diff --git a/e2e-harness/wizard-ci-driver.ts b/e2e-harness/wizard-ci-driver.ts
index cfa85a11..e23ce973 100644
--- a/e2e-harness/wizard-ci-driver.ts
+++ b/e2e-harness/wizard-ci-driver.ts
@@ -1,8 +1,8 @@
/**
* WizardCiDriver — the read/act control plane over a live WizardStore.
*
- * This is the SDK-free core of wizard-ci-tools. A test harness or a driver LLM
- * uses three primitives to run a real wizard end-to-end without a terminal:
+ * This is the read/act core both e2e routes drive. A test harness or a driver
+ * LLM uses these primitives to run a real wizard end-to-end without keystrokes:
*
* readState() — a truthful projection of the committed store state
* (the same state the Ink render is a pure function of),
diff --git a/e2e-harness/wizard-ci-tools.ts b/e2e-harness/wizard-ci-tools.ts
deleted file mode 100644
index f0d6db09..00000000
--- a/e2e-harness/wizard-ci-tools.ts
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * wizard-ci-tools — in-process MCP server exposing the WizardCiDriver.
- *
- * A thin SDK adapter over {@link WizardCiDriver}: three tools that let an
- * external driver (a test harness or an LLM) read the wizard's committed state
- * and commit decisions, driving a real run with no terminal.
- *
- * read_state — truthful snapshot + derived currentScreen + legal actions
- * list_actions — commit actions legal on the current screen
- * perform_action — invoke one (via the store setter the Ink screen would)
- *
- * Mirrors wizard-tools.ts: pure adapter behind a seam (the driver), importing
- * no product knowledge. The driver does the work; this just speaks MCP. The
- * SDK is dynamically imported so this module loads even where the SDK is mocked.
- */
-
-import { z } from 'zod';
-import type { WizardCiDriver } from './wizard-ci-driver.js';
-import { UnknownActionError, MissingParamError } from './wizard-ci-driver.js';
-
-let _sdkModule: unknown = null;
-async function getSDKModule(): Promise<{
- tool: (...args: unknown[]) => unknown;
- createSdkMcpServer: (opts: unknown) => unknown;
-}> {
- if (!_sdkModule) {
- _sdkModule = await import('@anthropic-ai/claude-agent-sdk');
- }
- return _sdkModule as never;
-}
-
-export const CI_TOOLS_SERVER_NAME = 'wizard-ci-tools';
-
-const ok = (data: unknown) => ({
- content: [{ type: 'text' as const, text: JSON.stringify(data, null, 2) }],
-});
-const err = (message: string) => ({
- content: [{ type: 'text' as const, text: `Error: ${message}` }],
- isError: true,
-});
-
-/** Create the wizard-ci-tools MCP server bound to a live driver. */
-export async function createWizardCiToolsServer(
- driver: WizardCiDriver,
-): Promise {
- const sdk = await getSDKModule();
- const { tool, createSdkMcpServer } = sdk;
-
- const readState = tool(
- 'read_state',
- "Read the wizard's current committed state: the active screen, run phase, " +
- 'a whitelisted view of the session, agent tasks/status/event-plan, any ' +
- 'pending wizard_ask question, unresolved setup questions, and the commit ' +
- 'actions legal right now. Call this first and after every perform_action.',
- {},
- () => ok(driver.readState()),
- );
-
- const listActions = tool(
- 'list_actions',
- 'List the commit actions legal on the current screen, with their params. ' +
- 'Each maps to the same store mutation the interactive UI would perform.',
- {},
- () =>
- ok({
- currentScreen: driver.readState().currentScreen,
- actions: driver.listActions(),
- }),
- );
-
- const performAction = tool(
- 'perform_action',
- 'Commit a decision by invoking a legal action for the current screen ' +
- '(e.g. confirm_setup, choose, answer_question, set_mcp_outcome, ' +
- 'dismiss_outro, keep_skills). Returns the next state. The action must ' +
- 'appear in read_state.actions for the current screen.',
- {
- action: z.string().describe('Action id from read_state.actions'),
- params: z
- .record(z.string(), z.unknown())
- .optional()
- .describe('Action params, e.g. { answers: { router: "app" } }'),
- },
- (args: { action: string; params?: Record }) => {
- try {
- return ok(driver.performAction(args.action, args.params ?? {}));
- } catch (e) {
- if (e instanceof UnknownActionError || e instanceof MissingParamError) {
- return err(e.message);
- }
- return err(e instanceof Error ? e.message : String(e));
- }
- },
- );
-
- return createSdkMcpServer({
- name: CI_TOOLS_SERVER_NAME,
- version: '1.0.0',
- tools: [readState, listActions, performAction],
- });
-}
-
-/** Fully-qualified MCP tool names, for allowedTools wiring. */
-export const CI_TOOL_NAMES = {
- readState: `mcp__${CI_TOOLS_SERVER_NAME}__read_state`,
- listActions: `mcp__${CI_TOOLS_SERVER_NAME}__list_actions`,
- performAction: `mcp__${CI_TOOLS_SERVER_NAME}__perform_action`,
-} as const;
diff --git a/package.json b/package.json
index 75a381f7..bcdb743b 100644
--- a/package.json
+++ b/package.json
@@ -62,6 +62,7 @@
"@babel/plugin-transform-modules-commonjs": "^7.28.6",
"@babel/preset-env": "^7.29.0",
"@babel/types": "~7.21.4",
+ "@modelcontextprotocol/sdk": "^1.29.0",
"@types/chai": "^4.3.17",
"@types/glob": "^7.2.0",
"@types/inquirer": "^0.0.43",
@@ -75,6 +76,7 @@
"@types/yargs": "^16.0.9",
"@typescript-eslint/eslint-plugin": "^5.13.0",
"@typescript-eslint/parser": "^5.13.0",
+ "@xterm/headless": "^6.0.0",
"babel-jest": "^29.7.0",
"dotenv": "^16.4.7",
"eslint": "^8.18.0",
@@ -85,14 +87,14 @@
"jest": "^29.5.0",
"lint-staged": "^15.5.1",
"msw": "^2.10.4",
+ "node-pty": "^1.1.0",
"prettier": "^2.8.7",
"rimraf": "^3.0.2",
"ts-jest": "^29.1.0",
"ts-node": "^10.9.1",
"tsdown": "^0.21.9",
"tsx": "^4.20.3",
- "typescript": "^5.0.4",
- "@modelcontextprotocol/sdk": "^1.29.0"
+ "typescript": "^5.0.4"
},
"engines": {
"node": "^20.20.0 || >=22.22.0",
@@ -121,7 +123,9 @@
"dev": "pnpm build && pnpm link --global && pnpm build:watch",
"test:watch": "jest --watch",
"prepare": "husky",
- "screens:check": "tsx scripts/check-screens.tsx"
+ "screens:check": "tsx scripts/check-screens.tsx",
+ "wizard-ci-explore": "tsx scripts/wizard-ci-explore.no-jest.ts",
+ "wizard-ci-replay": "tsx scripts/tui-replay.no-jest.ts"
},
"jest": {
"collectCoverage": true,
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e3c8aef5..d268d4c6 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -75,7 +75,7 @@ importers:
specifier: ^16.2.0
version: 16.2.0
zod:
- specifier: ^3.25.0
+ specifier: ^3.25.76
version: 3.25.76
zod-to-json-schema:
specifier: ^3.24.3
@@ -135,6 +135,9 @@ importers:
'@typescript-eslint/parser':
specifier: ^5.13.0
version: 5.62.0(eslint@8.57.1)(typescript@5.7.3)
+ '@xterm/headless':
+ specifier: ^6.0.0
+ version: 6.0.0
babel-jest:
specifier: ^29.7.0
version: 29.7.0(@babel/core@7.29.0)
@@ -165,6 +168,9 @@ importers:
msw:
specifier: ^2.10.4
version: 2.10.4(@types/node@18.19.76)(typescript@5.7.3)
+ node-pty:
+ specifier: ^1.1.0
+ version: 1.1.0
prettier:
specifier: ^2.8.7
version: 2.8.8
@@ -1642,6 +1648,9 @@ packages:
resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
engines: {node: '>=10.0.0'}
+ '@xterm/headless@6.0.0':
+ resolution: {integrity: sha512-5Yj1QINYCyzrZtf8OFIHi47iQtI+0qYFPHmouEfG8dHNxbZ9Tb9YGSuLcsEwj9Z+OL75GJqPyJbyoFer80a2Hw==}
+
accepts@2.0.0:
resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==}
engines: {node: '>= 0.6'}
@@ -3137,9 +3146,15 @@ packages:
resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==}
engines: {node: '>= 0.6'}
+ node-addon-api@7.1.1:
+ resolution: {integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==}
+
node-int64@0.4.0:
resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==}
+ node-pty@1.1.0:
+ resolution: {integrity: sha512-20JqtutY6JPXTUnL0ij1uad7Qe1baT46lyolh2sSENDd4sTzKZ4nmAFkeAARDKwmlLjPx6XKRlwRUxwjOy+lUg==}
+
node-releases@2.0.19:
resolution: {integrity: sha512-xxOWJsBKtzAq7DY0J+DTzuz58K8e7sJbdgwkbMWQe8UYB6ekmsQ45q0M/tJDsGaZmbC+l7n57UV8Hl5tHxO9uw==}
@@ -5783,6 +5798,8 @@ snapshots:
'@xmldom/xmldom@0.8.10': {}
+ '@xterm/headless@6.0.0': {}
+
accepts@2.0.0:
dependencies:
mime-types: 3.0.2
@@ -7474,8 +7491,14 @@ snapshots:
negotiator@1.0.0: {}
+ node-addon-api@7.1.1: {}
+
node-int64@0.4.0: {}
+ node-pty@1.1.0:
+ dependencies:
+ node-addon-api: 7.1.1
+
node-releases@2.0.19: {}
node-releases@2.0.27: {}
diff --git a/scripts/README.md b/scripts/README.md
index de1d0538..0a9aacbb 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -7,23 +7,27 @@ standalone `tsx` entry, named `*.no-jest.ts` so Jest ignores it.
Run from the repo root, e.g. `npx tsx scripts/.no-jest.ts`.
-| Script | What it does | Needs |
-| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
-| **`e2e-full-run.no-jest.ts`** | The full headless e2e: real `WizardStore` + `InkUI` (never rendered) + concurrent driver + **real `runAgent`** against prod cloud. Emits a structured result + a recording. | `POSTHOG_PERSONAL_API_KEY`, `APP_DIR`, `PROJECT_ID`; host `CLAUDE_*` env stripped |
-| **`render-snapshots.no-jest.ts`** | Renders a recording's key-moment frames to per-frame `.ans` snapshots (real Ink → ANSI). Feeds the workbench visual-regression flow. | a `recording.json` + outDir |
-| **`replay-e2e.no-jest.ts`** | Replays a recording in the terminal — reconstructs each frame's store and renders the **real Ink screen**. `--step` (Enter to advance) or `--delay ` (auto-play). | a `recording.json` |
-| **`wizard-ci-mcp.no-jest.ts`** | A stdio **MCP server** over one live store: an agent drives a real run turn-by-turn (`read_state` / `perform_action` / `run_agent` / `render_screen`). See the `exploring-the-wizard` skill. | `APP_DIR`, `POSTHOG_KEY_FILE` (or key), `PROJECT_ID` |
+Both e2e routes share one primitive: the **real TUI host** runs `startTUI` (the
+real ink render) and is driven purely by store state manipulation; a PTY parent
+([`e2e-harness/tui-capture.ts`](../e2e-harness/tui-capture.ts), node-pty +
+`@xterm/headless`) captures the real rendered screen.
-> You usually don't call these directly — `pnpm wizard-ci --e2e` and
-> `pnpm wizard-ci-snapshots` (in
-> [wizard-workbench](https://github.com/PostHog/wizard-workbench)) orchestrate
-> them with the env hygiene + assertions. A real `--e2e` run drops a recording
-> at `/tmp/wizard-e2e-.recording.json`.
+| Script | What it does | Needs |
+| ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ |
+| **`tui-host.no-jest.ts`** | The real-TUI host. `MODE=fixed` self-drives the fixed e2e profile and signals each screen; `MODE=serve` accepts drive commands (`read_state`/`perform_action`/`set_credentials`/`run_agent`) over a unix socket. | `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID`; run under a PTY |
+| **`tui-snapshots.no-jest.ts`** | CI snapshot route: spawns `tui-host` (`MODE=fixed`) in a PTY and writes the **real rendered** screen to `SNAP_OUT/NN-<screen>.txt` at each key moment. `RUN_AGENT=1` for the full run through outro. | `SNAP_OUT`, `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID` |
+| **`wizard-ci-mcp.no-jest.ts`** | Agent route: a stdio **MCP server** that proxies `tui-host` (`MODE=serve`) — `read_state`/`perform_action`/`run_agent` forward over the socket, `render_screen` returns the real captured frame. | spawns the host itself; key passed via `open_app` |
+| **`wizard-ci-explore.no-jest.ts`** | Quick eyeball of the agent route: drives the MCP server (`open_app → confirm_setup → render_screen`) and prints the real TUI. `pnpm wizard-ci-explore`. | `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID` |
+
+> You usually don't call these directly — `pnpm wizard-ci-snapshots` (in
+> [wizard-workbench](https://github.com/PostHog/wizard-workbench)) orchestrates
+> the snapshot route; the MCP server is registered in this repo's `.mcp.json` and
+> used via the `exploring-the-wizard` skill.
## Background
The control plane lives in [`e2e-harness/`](../e2e-harness/) — out of `src/`, so
none of it ships in prod. `WizardCiDriver` (read/act over the store), the
-screen→action registry, the `wizard-ci-tools` MCP server, the e2e profiles, and
-the recorder/replay. See [`ARCHITECTURE.md`](../e2e-harness/ARCHITECTURE.md) for
-how an agent drives these (env strip, scoped project id, gotchas).
+screen→action registry, the e2e profiles, and `tui-capture` (real-TUI PTY
+capture). See [`ARCHITECTURE.md`](../e2e-harness/ARCHITECTURE.md) for how the two
+routes drive these (env strip, scoped project id, gotchas).
diff --git a/scripts/e2e-full-run.no-jest.ts b/scripts/e2e-full-run.no-jest.ts
deleted file mode 100644
index 2f00d6ae..00000000
--- a/scripts/e2e-full-run.no-jest.ts
+++ /dev/null
@@ -1,280 +0,0 @@
-/**
- * Full headless e2e — runs the REAL wizard integration flow against prod cloud,
- * driven entirely by WizardCiDriver. No Ink, no browser, no LoggingUI.
- *
- * Unlike classic `--ci` (LoggingUI: runs the agent then exits, skipping the
- * intro / setup / mcp / slack / keep-skills screens and offering only
- * stdout to assert on), this runs the WHOLE interactive flow — the driver makes
- * each human-side decision through the same store setters the Ink UI would, and
- * the run is observed through structured `read_state`.
- *
- * POSTHOG_PERSONAL_API_KEY=… APP_DIR=/tmp/run-x PROJECT_ID=228144 \
- * tsx scripts/e2e-full-run.no-jest.ts
- */
-
-import fs from 'fs';
-import path from 'path';
-import { execFileSync } from 'child_process';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import { buildSession, RunPhase } from '@lib/wizard-session';
-import { Program } from '@lib/programs/program-registry';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import { runAgent } from '@lib/agent/agent-runner';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
-import type { ScreenName } from '@ui/tui/router';
-import {
- decideE2eAction,
- type WizardE2eProfile,
-} from '@e2e-harness/e2e-profile';
-import { profileFor } from '@e2e-harness/profiles';
-import { WizardRecorder } from '@e2e-harness/recorder';
-
-const log = (m: string) => process.stdout.write(`[e2e] ${m}\n`);
-
-/** Snapshot package.json deps + file list, to diff before/after. */
-function snapshot(dir: string): { deps: string[]; files: Set } {
- const files = new Set();
- const walk = (d: string, rel = '') => {
- for (const e of fs.readdirSync(d, { withFileTypes: true })) {
- if (e.name === 'node_modules' || e.name === '.git') continue;
- const r = path.join(rel, e.name);
- if (e.isDirectory()) walk(path.join(d, e.name), r);
- else files.add(r);
- }
- };
- walk(dir);
- let deps: string[] = [];
- try {
- const pkg = JSON.parse(
- fs.readFileSync(path.join(dir, 'package.json'), 'utf8'),
- );
- deps = Object.keys({ ...pkg.dependencies, ...pkg.devDependencies });
- } catch {
- /* no package.json (some frameworks) */
- }
- return { deps, files };
-}
-
-async function main() {
- const apiKey = (process.env.POSTHOG_PERSONAL_API_KEY ?? '').trim();
- const appDir = process.env.APP_DIR!;
- const projectId = process.env.PROJECT_ID ?? '228144';
- // Happy-path e2e policy: skip MCP + Slack always; KEEP vs DELETE skills is the
- // one knob (default = delete, matching `wizard-ci --e2e`). Health-check issues
- // are always dismissed so a transient outage never blocks the run.
- const keepSkills = process.env.E2E_KEEP_SKILLS === 'true';
- if (!apiKey) throw new Error('Set POSTHOG_PERSONAL_API_KEY');
- if (!appDir || !fs.existsSync(appDir))
- throw new Error(`APP_DIR missing: ${appDir}`);
-
- const before = snapshot(appDir);
- log(
- `app: ${appDir} (project ${projectId}) files=${before.files.size} deps=${before.deps.length}`,
- );
-
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store)); // real UI, never rendered
- const session = buildSession({
- installDir: appDir,
- ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
- apiKey,
- projectId, // the key's scoped project — required, else bootstrap 403s
- region: 'us',
- });
- store.session = session;
-
- // Record the run as a timeline of key-moment frames (route changes, task
- // updates, status lines, …) so it can be replayed in the terminal later.
- const recorder = new WizardRecorder(store, {
- program: 'posthog-integration',
- app: path.basename(appDir),
- });
- recorder.start();
-
- const driver = new WizardCiDriver(store);
-
- // The harness owns the per-program e2e UI choices (profileFor). It asks
- // decideE2eAction what to commit on each screen. The --keep-skills flag
- // (E2E_KEEP_SKILLS) overrides the profile's skills policy.
- const profile: WizardE2eProfile = {
- ...profileFor(Program.PostHogIntegration),
- ...(keepSkills ? { skills: 'keep' as const } : {}),
- };
- log(`e2e profile: ${JSON.stringify(profile)}`);
-
- // Concurrent driver loop: commits the profile's decision on each screen as it
- // appears, until the run signals skillsComplete.
- const seen: ScreenName[] = [];
- let stop = false;
- const driverLoop = async () => {
- while (!stop && !store.session.skillsComplete) {
- const state = driver.readState();
- const before = state.currentScreen;
- if (seen[seen.length - 1] !== before) {
- seen.push(before);
- log(`screen → ${before}`);
- }
- let acted = false;
- try {
- const decision = decideE2eAction(state, profile);
- if (decision.action) {
- driver.performAction(
- decision.action.id,
- decision.action.params ?? {},
- );
- acted = true;
- }
- if (decision.done) stop = true;
- } catch (e) {
- log(`driver action error on ${before}: ${(e as Error).message}`);
- }
- // If our own commit already advanced the screen (driver-driven sequences
- // like outro→mcp→slack→keep-skills), loop immediately to drive the next
- // one. Only block on waitForChange when we're waiting on an EXTERNAL
- // transition (the health probe, auth bootstrap, or the agent run).
- if (acted && store.currentScreen !== before) continue;
- await driver.waitForChange(600_000);
- }
- };
-
- const drive = driverLoop();
-
- // Reproduce run-wizard.ts headlessly: detection → init probe → gates → agent.
- await store.runReadyHooks();
- log(`detected: ${store.session.integration ?? '(none)'}`);
- store.runInitHooks(); // fires the health-check readiness probe
- await store.getGate('intro');
- await store.getGate('health-check');
- log('gates cleared (intro + health) — starting real agent');
-
- await runAgent(posthogIntegrationConfig, store.session);
- log(`agent run finished: runPhase=${store.session.runPhase}`);
-
- // Let the driver walk the post-run screens to completion.
- const deadline = Date.now() + 120_000;
- while (!store.session.skillsComplete && Date.now() < deadline) {
- await driver.waitForChange(5_000);
- }
- stop = true;
- await Promise.race([drive, Promise.resolve()]);
-
- // "Delete skills" is a KeepSkillsScreen side-effect (it `rm`s the
- // wizard-installed skill dirs), not a store setter — so the headless driver's
- // keep_skills{kept:false} only flips the flag. Replicate the deletion here, in
- // the orchestrator, where fs side-effects belong. Mirrors the screen: remove
- // each wizard-marked skill dir, then the skills/ dir if it's left empty.
- let skillsDeleted = false;
- if (profile.skills === 'delete') {
- const skillsDir = path.join(appDir, '.claude', 'skills');
- if (fs.existsSync(skillsDir)) {
- for (const dir of fs.readdirSync(skillsDir, { withFileTypes: true })) {
- if (!dir.isDirectory()) continue;
- if (fs.existsSync(path.join(skillsDir, dir.name, '.posthog-wizard'))) {
- fs.rmSync(path.join(skillsDir, dir.name), {
- recursive: true,
- force: true,
- });
- skillsDeleted = true;
- }
- }
- if (fs.readdirSync(skillsDir).length === 0) {
- fs.rmSync(skillsDir, { recursive: true, force: true });
- }
- }
- log(`skills deleted: ${skillsDeleted}`);
- }
-
- // Assertions: structured state + real file changes.
- const after = snapshot(appDir);
- const newDeps = after.deps.filter((d) => !before.deps.includes(d));
- const newFiles = [...after.files].filter((f) => !before.files.has(f));
- const hasPosthogDep = after.deps.some((d) =>
- d.toLowerCase().includes('posthog'),
- );
- // Detect a PostHog env file directly on disk (more robust than a file diff:
- // an .env may have pre-existed and only had keys appended).
- const envFile = [...after.files]
- .filter((f) => path.basename(f).startsWith('.env'))
- .find((f) => {
- try {
- return /POSTHOG/i.test(fs.readFileSync(path.join(appDir, f), 'utf8'));
- } catch {
- return false;
- }
- });
-
- log('');
- log('================ RESULT ================');
- log(`screen path : ${seen.join(' → ')}`);
- log(`runPhase : ${store.session.runPhase}`);
- log(`skillsComplete: ${store.session.skillsComplete}`);
- log(`new deps : ${newDeps.join(', ') || '(none)'}`);
- log(`posthog dep : ${hasPosthogDep}`);
- log(`new files : ${newFiles.join(', ') || '(none)'}`);
- log(`.env written: ${envFile ?? 'no'}`);
-
- const integrated =
- store.session.runPhase === RunPhase.Completed &&
- (hasPosthogDep || !!envFile);
- log(
- `\n${
- integrated ? '✓ FULL INTEGRATION LANDED' : '✗ integration incomplete'
- }`,
- );
- log('========================================');
-
- // Structured result for a harness/orchestrator (e.g. the workbench service) to
- // assert on — the control plane's payoff over stdout-grepping classic --ci.
- const result = {
- integrated,
- installDir: appDir,
- screenPath: seen,
- runPhase: store.session.runPhase,
- skillsComplete: store.session.skillsComplete,
- skillsPolicy: profile.skills,
- skillsDeleted,
- newDeps,
- hasPosthogDep,
- newFiles,
- envFile: envFile ?? null,
- };
- const resultPath = process.env.E2E_RESULT_JSON;
- if (resultPath) {
- fs.writeFileSync(resultPath, JSON.stringify(result, null, 2));
- log(`result json → ${resultPath}`);
- }
-
- // Save the run recording and tell the caller how to replay it.
- recorder.stop();
- const recordingPath =
- process.env.E2E_RECORDING_JSON ??
- `/tmp/wizard-e2e-${path.basename(appDir)}.recording.json`;
- fs.writeFileSync(
- recordingPath,
- JSON.stringify(recorder.getRecording(), null, 2),
- );
- log(`recording (${recorder.frameCount} frames) → ${recordingPath}`);
- log(`replay it: tsx scripts/replay-e2e.no-jest.ts ${recordingPath} --step`);
-
- process.exit(integrated ? 0 : 1);
-}
-
-main().catch((e) => {
- process.stderr.write(`\nE2E_FAIL: ${e?.stack ?? e}\n`);
- process.exit(1);
-});
-
-// Keep the rsync helper reference so the import isn't dropped by tree-shaking
-// in case a caller wants to copy from here later.
-export const _copy = (from: string, to: string) =>
- execFileSync('rsync', [
- '-a',
- '--exclude',
- 'node_modules',
- '--exclude',
- '.git',
- `${from}/`,
- `${to}/`,
- ]);
diff --git a/scripts/render-snapshots.no-jest.ts b/scripts/render-snapshots.no-jest.ts
deleted file mode 100644
index 3e0389c0..00000000
--- a/scripts/render-snapshots.no-jest.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Render a recording to per-frame TUI snapshots — one `.ans` file per key
- * moment, the real Ink screen rendered to ANSI via replay's renderFrame (needs
- * real ink, hence tsx not jest). Feeds the workbench visual-comparison flow.
- *
- * tsx scripts/render-snapshots.no-jest.ts
- */
-import { mkdirSync, writeFileSync, rmSync } from 'fs';
-import { join } from 'path';
-import type { ProgramId } from '@ui/tui/router';
-import { loadRecording, renderFrame } from '@e2e-harness/replay';
-
-const [recordingPath, outDir] = process.argv.slice(2);
-if (!recordingPath || !outDir) {
- process.stderr.write('usage: render-snapshots \n');
- process.exit(2);
-}
-
-const rec = loadRecording(recordingPath);
-const program = rec.meta.program as ProgramId;
-
-rmSync(outDir, { recursive: true, force: true });
-mkdirSync(outDir, { recursive: true });
-
-for (const frame of rec.frames) {
- const seq = String(frame.seq).padStart(2, '0');
- // One file per key moment: -[-].ans
- const name = `${seq}-${frame.screen}.ans`;
- writeFileSync(join(outDir, name), renderFrame(frame, program));
-}
-
-process.stdout.write(`rendered ${rec.frames.length} snapshots → ${outDir}\n`);
diff --git a/scripts/replay-e2e.no-jest.ts b/scripts/replay-e2e.no-jest.ts
deleted file mode 100644
index f3420ab3..00000000
--- a/scripts/replay-e2e.no-jest.ts
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Replay a recorded wizard run in the terminal.
- *
- * tsx scripts/replay-e2e.no-jest.ts [--step] [--delay ]
- *
- * --step advance frame-by-frame on Enter (default)
- * --delay auto-play with between frames (e.g. --delay 1200)
- */
-import { createInterface } from 'readline';
-import type { ProgramId } from '@ui/tui/router';
-import { loadRecording, renderFrame, frameHeader } from '@e2e-harness/replay';
-
-const ENTER_ALT = '\x1b[?1049h';
-const LEAVE_ALT = '\x1b[?1049l';
-const CLEAR = '\x1b[2J\x1b[H';
-
-async function main() {
- const args = process.argv.slice(2);
- const file = args.find((a) => !a.startsWith('-'));
- if (!file) {
- process.stderr.write(
- 'usage: replay-e2e [--step] [--delay ]\n',
- );
- process.exit(2);
- }
- const delayArg = args.indexOf('--delay');
- const autoDelay = delayArg !== -1 ? Number(args[delayArg + 1]) : null;
- const step = autoDelay === null; // default to step unless --delay given
-
- const rec = loadRecording(file);
- const program = rec.meta.program as ProgramId;
- const total = rec.frames.length;
-
- process.stdout.write(ENTER_ALT);
- const cleanup = () => process.stdout.write(LEAVE_ALT);
- process.on('exit', cleanup);
-
- const rl = step
- ? createInterface({ input: process.stdin, output: process.stdout })
- : null;
- const ask = (q: string) =>
- new Promise((res) => rl!.question(q, () => res()));
- const sleep = (ms: number) => new Promise((res) => setTimeout(res, ms));
-
- for (const frame of rec.frames) {
- process.stdout.write(CLEAR);
- process.stdout.write(`\x1b[2m${rec.meta.app ?? rec.meta.program}\x1b[0m\n`);
- process.stdout.write(frameHeader(frame, total) + '\n\n');
- process.stdout.write(renderFrame(frame, program) + '\n');
- if (step) {
- await ask(
- `\n\x1b[2m[${
- frame.seq + 1
- }/${total}] Enter ▸ next · Ctrl-C ▸ quit\x1b[0m`,
- );
- } else {
- await sleep(autoDelay!);
- }
- }
-
- rl?.close();
- cleanup();
- process.stdout.write(`\nReplayed ${total} frames from ${file}\n`);
- process.exit(0);
-}
-
-main().catch((e) => {
- process.stdout.write(LEAVE_ALT);
- process.stderr.write(`replay failed: ${e?.stack ?? e}\n`);
- process.exit(1);
-});
diff --git a/scripts/tui-host.no-jest.ts b/scripts/tui-host.no-jest.ts
new file mode 100644
index 00000000..a9e68c0e
--- /dev/null
+++ b/scripts/tui-host.no-jest.ts
@@ -0,0 +1,313 @@
+/**
+ * Shared real-TUI host — the one primitive both e2e routes use.
+ *
+ * Runs the real `startTUI` (real ink render → this process's stdout, which the
+ * PTY parent captures) and drives its store by pure state manipulation via
+ * `WizardCiDriver` — no keystrokes. Auth is satisfied by `setCredentials` with
+ * the phx key (same bearer as an OAuth token).
+ *
+ *   MODE=fixed  — self-drive the fixed e2e profile, snapshotting each screen
+ *                 (the CI snapshot route). Requires SNAP_CTRL.
+ *   MODE=serve  — listen on CONTROL_SOCK for {read_state, perform_action,
+ *                 set_credentials, run_agent} commands (the agent/MCP route).
+ *                 One JSON request per line; one JSON reply per line, in
+ *                 request order.
+ *
+ * Never writes to stdout (that's the TUI); diagnostics go to the wizard log file.
+ */
+import fs from 'fs';
+import net from 'net';
+import { startTUI } from '@ui/tui/start-tui';
+import { VERSION } from '@lib/version';
+import { Program } from '@lib/programs/program-registry';
+import { buildSession } from '@lib/wizard-session';
+import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
+import { runAgent } from '@lib/agent/agent-runner';
+import { getOrAskForProjectData } from '@utils/setup-utils';
+import { logToFile } from '@utils/debug';
+import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
+import {
+  decideE2eAction,
+  type WizardE2eProfile,
+} from '@e2e-harness/e2e-profile';
+import { profileFor } from '@e2e-harness/profiles';
+
+const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
+const mark = (m: string) => logToFile(`[tui-host] ${m}`);
+const errText = (e: unknown) => (e instanceof Error ? e.message : String(e));
+
+/** Read a required env var, failing with a named error instead of a later TypeError. */
+function requireEnv(name: string): string {
+  const value = process.env[name];
+  if (!value) throw new Error(`${name} is required`);
+  return value;
+}
+
+async function main() {
+  const apiKey = (
+    process.env.POSTHOG_PERSONAL_API_KEY ??
+    (process.env.POSTHOG_KEY_FILE
+      ? fs.readFileSync(process.env.POSTHOG_KEY_FILE, 'utf8')
+      : '')
+  ).trim();
+  const projectId = process.env.PROJECT_ID!;
+
+  const { store } = startTUI(VERSION, Program.PostHogIntegration);
+  store.session = buildSession({
+    installDir: process.env.APP_DIR!,
+    ci: true,
+    apiKey,
+    projectId,
+    region: 'us',
+  });
+  const driver = new WizardCiDriver(store);
+
+  // Resolve credentials from the phx key (same bearer as an OAuth token) and set
+  // them on the store — advances the auth screen with no browser, no keystrokes.
+  const authByState = async () => {
+    const d = await getOrAskForProjectData({
+      signup: false,
+      ci: true,
+      apiKey,
+      projectId,
+      programId: Program.PostHogIntegration,
+    });
+    store.setCredentials({
+      accessToken: d.accessToken,
+      projectApiKey: d.projectApiKey,
+      host: d.host,
+      projectId: d.projectId,
+    });
+  };
+
+  if (process.env.MODE === 'serve') return serve();
+  return fixed();
+
+  // ---- agent route: drive commands over a unix socket ----
+  function serve() {
+    // Checked up front: with an undefined path listen() would bind an arbitrary
+    // TCP port instead of failing.
+    const sockPath = requireEnv('CONTROL_SOCK');
+    let runStatus: 'idle' | 'running' | 'done' | 'failed' = 'idle';
+    const handle = async (req: {
+      type: string;
+      action?: string;
+      params?: Record<string, unknown>;
+    }) => {
+      try {
+        switch (req.type) {
+          case 'read_state':
+            return {
+              ok: true,
+              state: { ...driver.readState(), integration: runStatus },
+            };
+          case 'perform_action':
+            if (!req.action)
+              return { ok: false, error: 'perform_action requires "action"' };
+            return {
+              ok: true,
+              state: driver.performAction(req.action, req.params ?? {}),
+            };
+          case 'set_credentials':
+            await authByState();
+            return { ok: true, state: driver.readState() };
+          case 'run_agent': {
+            if (runStatus === 'running' || runStatus === 'done')
+              return { ok: true, runStatus };
+            runStatus = 'running';
+            // Fire-and-forget: reply now; read_state reports it as `integration`.
+            void (async () => {
+              try {
+                await store.getGate('intro');
+                await store.getGate('health-check');
+                await runAgent(posthogIntegrationConfig, store.session);
+                runStatus = 'done';
+              } catch (e) {
+                runStatus = 'failed';
+                mark('run_agent error ' + errText(e));
+              }
+            })();
+            return { ok: true, runStatus: 'running' };
+          }
+          default:
+            return { ok: false, error: `unknown command ${req.type}` };
+        }
+      } catch (e) {
+        return { ok: false, error: errText(e) };
+      }
+    };
+    const server = net.createServer((sock) => {
+      // Decode as UTF-8 across chunk boundaries rather than chunk by chunk.
+      sock.setEncoding('utf8');
+      // A client hanging up mid-reply must not take the host (and the TUI) down.
+      sock.on('error', (e) => mark(`socket error ${e.message}`));
+      let buf = '';
+      // Replies carry no request id, so requests are handled strictly in order.
+      let queue: Promise<void> = Promise.resolve();
+      const reply = (res: unknown) => {
+        if (!sock.destroyed) sock.write(JSON.stringify(res) + '\n');
+      };
+      const dispatch = async (line: string) => {
+        let req: Parameters<typeof handle>[0];
+        try {
+          req = JSON.parse(line);
+        } catch (e) {
+          // Malformed input is the client's error; answer it instead of crashing.
+          reply({ ok: false, error: `invalid JSON: ${errText(e)}` });
+          return;
+        }
+        reply(await handle(req));
+      };
+      sock.on('data', (d) => {
+        buf += d;
+        let i: number;
+        while ((i = buf.indexOf('\n')) >= 0) {
+          const line = buf.slice(0, i);
+          buf = buf.slice(i + 1);
+          if (!line.trim()) continue;
+          queue = queue
+            .then(() => dispatch(line))
+            .catch((e) => mark('dispatch error ' + errText(e)));
+        }
+      });
+    });
+    // A bind failure would otherwise surface as an uncaught 'error' event.
+    server.on('error', (e) => {
+      mark('FATAL control socket ' + e.message);
+      process.exit(1);
+    });
+    try {
+      fs.unlinkSync(sockPath);
+    } catch {
+      /* fresh */
+    }
+    server.listen(sockPath, () => mark(`serving on ${sockPath}`));
+    // Detection so the intro screen fills in; a failure is logged, not fatal.
+    void store
+      .runReadyHooks()
+      .catch((e: unknown) => mark('ready hooks error ' + errText(e)));
+  }
+
+  // ---- CI route: self-drive the fixed profile, snapshot each screen ----
+  async function fixed() {
+    const CTRL = requireEnv('SNAP_CTRL');
+    const runFull = process.env.RUN_AGENT === '1';
+    const profile: WizardE2eProfile = profileFor(Program.PostHogIntegration);
+    const screenPath: string[] = [];
+    let lastSnap = '';
+    // Signal the PTY parent (via the control file) once per newly shown screen.
+    const snap = async () => {
+      const s = store.currentScreen;
+      if (s === lastSnap) return;
+      lastSnap = s;
+      screenPath.push(s);
+      await sleep(650); // presumably lets the render settle before signalling
+      fs.appendFileSync(CTRL, s + '\n');
+      await sleep(400); // presumably holds the screen while the parent captures
+    };
+    let stop = false;
+    const driverLoop = async () => {
+      while (!stop && !store.session.skillsComplete) {
+        await snap();
+        const state = driver.readState();
+        const before = state.currentScreen;
+        let acted = false;
+        try {
+          const decision = decideE2eAction(state, profile);
+          if (decision.action) {
+            driver.performAction(
+              decision.action.id,
+              decision.action.params ?? {},
+            );
+            acted = true;
+          }
+          if (decision.done) stop = true;
+        } catch (e) {
+          mark(`action error on ${before}: ${errText(e)}`);
+        }
+        // Our own commit already advanced the screen: drive the next one now.
+        if (acted && store.currentScreen !== before) continue;
+        if (!stop) await driver.waitForChange(600_000);
+      }
+    };
+    const drive = driverLoop();
+    // Still observed by the race below; this only stops a rejection that lands
+    // before then from being reported as unhandled.
+    void drive.catch(() => undefined);
+
+    await store.runReadyHooks();
+    await store.getGate('intro');
+    await store.getGate('health-check');
+
+    if (runFull) {
+      await runAgent(posthogIntegrationConfig, store.session);
+      const deadline = Date.now() + 120_000;
+      while (!store.session.skillsComplete && Date.now() < deadline)
+        await driver.waitForChange(5_000);
+    } else {
+      await authByState();
+      await sleep(2500);
+      await snap(); // the run screen
+    }
+    stop = true;
+    // The loop only notices `stop` when it next wakes, and it may be parked in
+    // waitForChange(600_000); allow a short grace period rather than blocking
+    // exit on a store change that may never come.
+    await Promise.race([drive, sleep(2_000)]);
+    await sleep(400);
+
+    // Structured result for the --e2e assertion path (a subset of the fields e2e-full-run emitted).
+    if (process.env.E2E_RESULT_JSON) {
+      const appDir = process.env.APP_DIR!;
+      let deps: string[] = [];
+      try {
+        const pkg = JSON.parse(
+          fs.readFileSync(`${appDir}/package.json`, 'utf8'),
+        );
+        deps = Object.keys({ ...pkg.dependencies, ...pkg.devDependencies });
+      } catch {
+        /* some frameworks have no package.json */
+      }
+      const posthogDeps = deps.filter((d) => d.includes('posthog'));
+      // Probe each candidate on its own so one unreadable entry (e.g. a directory
+      // named `.env.d`) cannot hide a real env file.
+      const mentionsPosthog = (f: string) => {
+        try {
+          return /posthog/i.test(fs.readFileSync(`${appDir}/${f}`, 'utf8'));
+        } catch {
+          return false;
+        }
+      };
+      let envFile: string | null = null;
+      try {
+        const hit = fs
+          .readdirSync(appDir)
+          .find((f) => f.startsWith('.env') && mentionsPosthog(f));
+        envFile = hit ? `${appDir}/${hit}` : null;
+      } catch {
+        /* none */
+      }
+      fs.writeFileSync(
+        process.env.E2E_RESULT_JSON,
+        JSON.stringify(
+          {
+            runPhase: store.session.runPhase,
+            hasPosthogDep: posthogDeps.length > 0,
+            newDeps: posthogDeps,
+            envFile,
+            screenPath,
+            skillsComplete: store.session.skillsComplete,
+          },
+          null,
+          2,
+        ),
+      );
+    }
+    process.exit(0);
+  }
+}
+
+main().catch((e) => {
+  mark('FATAL ' + (e?.stack ?? e));
+  process.exit(1);
+});
diff --git a/scripts/tui-replay.no-jest.ts b/scripts/tui-replay.no-jest.ts
new file mode 100644
index 00000000..55b2fa4b
--- /dev/null
+++ b/scripts/tui-replay.no-jest.ts
@@ -0,0 +1,75 @@
+/**
+ * Replay captured real-TUI snapshots in the terminal — step through or auto-play
+ * the `NN-<screen>.txt` frames a snapshot run dropped in SNAP_OUT.
+ *
+ *   npx tsx scripts/tui-replay.no-jest.ts <snapshot-dir> [--step | --delay <ms>]
+ *   pnpm wizard-ci-replay /tmp/snaps                 # Enter ▸ advance (default)
+ *   pnpm wizard-ci-replay /tmp/snaps --delay 1200    # auto-play
+ *
+ * <snapshot-dir> falls back to $SNAP_OUT when omitted.
+ */
+import fs from 'fs';
+import path from 'path';
+import { createInterface } from 'readline';
+
+const USAGE = 'usage: tui-replay <snapshot-dir> [--step | --delay <ms>]';
+
+const args = process.argv.slice(2);
+const delayIdx = args.indexOf('--delay');
+const delay = delayIdx >= 0 ? Number(args[delayIdx + 1]) : null;
+// First non-flag argument that is not --delay's value, so flags may come first.
+const dir =
+  args.find(
+    (a, i) => !a.startsWith('-') && (delayIdx < 0 || i !== delayIdx + 1),
+  ) || process.env.SNAP_OUT;
+
+const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
+const clear = () => process.stdout.write('\x1b[2J\x1b[3J\x1b[H');
+
+/** Show `prompt` and resolve once the user submits a line (presses Enter). */
+async function pressEnter(prompt: string): Promise<void> {
+  const rl = createInterface({ input: process.stdin, output: process.stdout });
+  await new Promise<void>((res) => rl.question(prompt, () => res()));
+  rl.close();
+}
+
+async function main() {
+  // A missing or non-numeric --delay value is NaN, which `??` below would not
+  // replace and the timer would treat as ~0ms — reject it instead.
+  if (delay !== null && (!Number.isFinite(delay) || delay < 0)) {
+    console.error(`✖ --delay needs a non-negative number of ms\n ${USAGE}`);
+    process.exit(2);
+  }
+  if (!dir || !fs.existsSync(dir)) {
+    console.error(`✖ snapshot dir not found: ${dir ?? '(none)'}\n ${USAGE}`);
+    process.exit(2);
+  }
+  const frames = fs
+    .readdirSync(dir)
+    .filter((f) => f.endsWith('.txt') && f !== 'latest.txt')
+    // Numeric-aware: `100-…` must follow `99-…` once the 2-digit pad overflows.
+    .sort((a, b) => a.localeCompare(b, 'en', { numeric: true }));
+  if (frames.length === 0) {
+    console.error(`✖ no NN-<screen>.txt snapshots in ${dir}`);
+    process.exit(1);
+  }
+  // Step (Enter to advance) is the default; fall back to a timed play when not a
+  // TTY (e.g. CI) so it never hangs, or when --delay is given.
+  const autoMs = delay ?? (process.stdin.isTTY ? null : 600);
+
+  for (let i = 0; i < frames.length; i++) {
+    clear();
+    process.stdout.write(fs.readFileSync(path.join(dir, frames[i]), 'utf8'));
+    process.stdout.write(`\n [${i + 1}/${frames.length}] ${frames[i]}\n`);
+    if (i === frames.length - 1) break;
+    if (autoMs != null) await sleep(autoMs);
+    else await pressEnter(' ⏎ next ▸ ');
+  }
+  process.stdout.write('\n ✓ end of snapshots\n');
+  process.exit(0);
+}
+
+main().catch((e) => {
+  console.error(`✖ replay failed: ${e?.stack ?? e}`);
+  process.exit(1);
+});
diff --git a/scripts/tui-snapshots.no-jest.ts b/scripts/tui-snapshots.no-jest.ts
new file mode 100644
index 00000000..ef946466
--- /dev/null
+++ b/scripts/tui-snapshots.no-jest.ts
@@ -0,0 +1,77 @@
+/**
+ * Fixed-route snapshots of the REAL TUI (Node, single-stack).
+ *
+ * Spawns the real-TUI host (MODE=fixed) in a PTY, lets it self-drive the fixed
+ * e2e profile, and writes the real rendered screen to SNAP_OUT/NN-.txt
+ * each time the host signals a new screen via the control file.
+ *
+ * SNAP_OUT=/tmp/snaps APP_DIR=/tmp/app POSTHOG_KEY_FILE=… PROJECT_ID=… \
+ * RUN_AGENT=0|1 npx tsx scripts/tui-snapshots.no-jest.ts
+ */
+import fs from 'fs';
+import path from 'path';
+import { captureTui } from '@e2e-harness/tui-capture';
+
+// Fail with a clear message instead of a TypeError from path.join(undefined).
+const OUT = process.env.SNAP_OUT ?? '';
+if (!OUT) {
+  process.stderr.write('✖ SNAP_OUT is required (snapshot output directory)\n');
+  process.exit(2);
+}
+// Control file: the host appends one label per line; it is tailed below.
+const CTRL = path.join(OUT, 'ctrl');
+fs.mkdirSync(OUT, { recursive: true });
+fs.writeFileSync(CTRL, '');
+
+const env: NodeJS.ProcessEnv = {
+  ...process.env,
+  MODE: 'fixed',
+  SNAP_CTRL: CTRL,
+};
+for (const k of Object.keys(env))
+  if (/^(CLAUDE|ANTHROPIC)/.test(k)) delete env[k]; // gateway auth via phx, not host creds
+
+const cap = captureTui({
+  cmd: path.join(process.cwd(), 'node_modules/.bin/tsx'),
+  args: ['scripts/tui-host.no-jest.ts'],
+  cwd: process.cwd(),
+  env,
+});
+
+const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
+let pos = 0; // characters of the control file already consumed
+let seq = 0; // snapshots written so far; also the NN filename prefix
+
+/** Write one snapshot per label appended to the control file since last call. */
+async function drainOnce(): Promise<void> {
+  const data = fs.readFileSync(CTRL, 'utf8').slice(pos);
+  pos += data.length;
+  for (const raw of data.split('\n')) {
+    const label = raw.trim();
+    if (!label) continue;
+    await sleep(200); // let xterm apply the final writes for this screen
+    seq += 1;
+    const fn = path.join(OUT, `${String(seq).padStart(2, '0')}-${label}.txt`);
+    fs.writeFileSync(fn, cap.frame());
+    // eslint-disable-next-line no-console
+    console.log('snap ->', path.basename(fn));
+  }
+}
+
+// Drains are chained so they never overlap. Without this, the final drain at
+// exit could return (and the process exit) while a drain started by the
+// interval was still mid-sleep, dropping the last snapshot(s).
+let draining: Promise<void> = Promise.resolve();
+function drainCtrl(): Promise<void> {
+  draining = draining.then(drainOnce);
+  return draining;
+}
+
+const timer = setInterval(() => void drainCtrl(), 150);
+void cap.exited.then(async () => {
+  clearInterval(timer); // stop queueing new drains
+  await drainCtrl(); // runs after every in-flight drain has finished
+  // eslint-disable-next-line no-console
+  console.log(`done; ${seq} snapshots in ${OUT}`);
+  process.exit(0);
+});
diff --git a/scripts/wizard-ci-explore.no-jest.ts b/scripts/wizard-ci-explore.no-jest.ts
new file mode 100644
index 00000000..79313388
--- /dev/null
+++ b/scripts/wizard-ci-explore.no-jest.ts
@@ -0,0 +1,105 @@
+/**
+ * Quick eyeball test of the agent (MCP) route — without a full Claude session.
+ *
+ * Spawns the wizard-ci MCP server (which boots the real TUI host), drives a few
+ * steps over stdio JSON-RPC, and prints the REAL rendered screen that
+ * render_screen returns. Pass STEP=run to also kick off the integration.
+ *
+ * APP_DIR=/tmp/app POSTHOG_KEY_FILE=/path/phx.txt PROJECT_ID=228144 \
+ * npx tsx scripts/wizard-ci-explore.no-jest.ts
+ */
+import { spawn } from 'child_process';
+import path from 'path';
+
+const srv = spawn(
+ path.join(process.cwd(), 'node_modules/.bin/tsx'),
+ ['scripts/wizard-ci-mcp.no-jest.ts'],
+ { cwd: process.cwd(), stdio: ['pipe', 'pipe', 'inherit'] },
+);
+
+let buf = '';
+const pending = new Map<
+ number,
+ (m: { result: { content: Array<{ text: string }> } }) => void
+>();
+srv.stdout.on('data', (d) => {
+ buf += d;
+ let i;
+ while ((i = buf.indexOf('\n')) >= 0) {
+ const line = buf.slice(0, i);
+ buf = buf.slice(i + 1);
+ if (!line.trim()) continue;
+ let m;
+ try {
+ m = JSON.parse(line);
+ } catch {
+ continue;
+ }
+ if (m.id && pending.has(m.id)) {
+ pending.get(m.id)!(m);
+ pending.delete(m.id);
+ }
+ }
+});
+
+let idc = 0;
+const send = (
+ method: string,
+ params: unknown,
+): Promise<{ result: { content: Array<{ text: string }> } }> =>
+ new Promise((r) => {
+ const id = ++idc;
+ pending.set(id, r);
+ srv.stdin.write(
+ JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n',
+ );
+ });
+const notify = (method: string, params?: unknown) =>
+ srv.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n');
+const call = (name: string, args: Record<string, unknown> = {}) =>
+  send('tools/call', { name, arguments: args }); // invoke one MCP tool by name
+const out = (r: { result: { content: Array<{ text: string }> } }) =>
+ r.result.content[0].text;
+const screen = (r: { result: { content: Array<{ text: string }> } }) => {
+ try {
+ return JSON.parse(out(r)).currentScreen as string;
+ } catch {
+ return out(r);
+ }
+};
+
+async function main() {
+ await send('initialize', {
+ protocolVersion: '2024-11-05',
+ capabilities: {},
+ clientInfo: { name: 'explore', version: '1' },
+ });
+ notify('notifications/initialized');
+
+ const open = await call('open_app', {
+ appDir: process.env.APP_DIR,
+ keyFile: process.env.POSTHOG_KEY_FILE,
+ projectId: process.env.PROJECT_ID,
+ region: process.env.POSTHOG_REGION ?? 'us',
+ });
+ process.stdout.write(`open_app → ${screen(open)}\n`);
+ process.stdout.write(
+ `confirm_setup → ${screen(
+ await call('perform_action', { action: 'confirm_setup' }),
+ )}\n`,
+ );
+ process.stdout.write(
+ `read_state → ${screen(await call('read_state'))}\n`,
+ );
+
+ process.stdout.write('\n=== render_screen (the REAL TUI) ===\n');
+ process.stdout.write(out(await call('render_screen')));
+
+ srv.kill();
+ process.exit(0);
+}
+main().catch((e) => {
+ process.stderr.write(`explore error: ${e?.stack ?? e}\n`);
+ srv.kill();
+ process.exit(1);
+});
diff --git a/scripts/wizard-ci-mcp.no-jest.ts b/scripts/wizard-ci-mcp.no-jest.ts
index 4d2cd989..79f7beec 100644
--- a/scripts/wizard-ci-mcp.no-jest.ts
+++ b/scripts/wizard-ci-mcp.no-jest.ts
@@ -1,34 +1,22 @@
/**
- * wizard-ci-mcp — a stdio MCP server that holds one live WizardStore and exposes
- * it as tools, so an agent drives a real wizard run turn by turn: open_app →
- * read_state → perform_action → … → run_agent → … → keep_skills, rendering the
- * screen whenever it wants.
+ * wizard-ci-mcp — MCP server that lets an agent drive the REAL wizard TUI.
*
- * Registered in this repo's `.mcp.json`, so the tools are bound in every session
- * here — no per-run setup. It boots app-agnostic; `open_app` picks the app +
- * credentials at call time (so nothing secret lives in `.mcp.json`). It also
- * auto-opens from APP_DIR / POSTHOG_KEY_FILE env if those happen to be set.
+ * A thin proxy: it spawns the shared real-TUI host (scripts/tui-host.no-jest.ts,
+ * MODE=serve) in a PTY via the Node capturer, forwards read_state/perform_action/
+ * run_agent to it over a unix socket, and returns the REAL rendered screen for
+ * render_screen. No store or rendering lives here — same host the CI snapshot
+ * route uses. stdout is the JSON-RPC channel; nothing else writes to it.
*
- * stdout is the JSON-RPC channel — diagnostics go to stderr only.
+ * Registered in this repo's `.mcp.json`, so the tools are bound in every session.
*/
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { z } from 'zod';
import fs from 'fs';
-import { WizardStore } from '@ui/tui/store';
-import { InkUI } from '@ui/tui/ink-ui';
-import { setUI } from '@ui/index';
-import {
- buildSession,
- RunPhase,
- type WizardSession,
-} from '@lib/wizard-session';
-import { Program } from '@lib/programs/program-registry';
-import { posthogIntegrationConfig } from '@lib/programs/posthog-integration';
-import { runAgent } from '@lib/agent/agent-runner';
-import { WizardCiDriver } from '@e2e-harness/wizard-ci-driver';
-import { renderFrame } from '@e2e-harness/replay';
-import type { RecordedFrame } from '@e2e-harness/recorder';
+import os from 'os';
+import path from 'path';
+import net from 'net';
+import { captureTui, type TuiCapture } from '@e2e-harness/tui-capture';
const text = (data: unknown) => ({
content: [
@@ -38,7 +26,6 @@ const text = (data: unknown) => ({
},
],
});
-
const errorOut = (e: unknown) => ({
content: [
{
@@ -49,91 +36,53 @@ const errorOut = (e: unknown) => ({
isError: true,
});
-/** Render a store's current screen to ANSI (access token redacted). */
-function renderNow(store: WizardStore): string {
- const s = store.session;
- const session: WizardSession = s.credentials
- ? {
- ...s,
- credentials: { ...s.credentials, accessToken: 'phx_***redacted***' },
- }
- : s;
- const frame: RecordedFrame = {
- seq: 0,
- ms: 0,
- triggers: ['screen'],
- screen: store.currentScreen,
- hasOverlay: store.router.hasOverlay,
- session,
- tasks: store.tasks.map((t) => ({
- label: t.label,
- status: t.status,
- activeForm: t.activeForm,
- done: t.done,
- })),
- statusMessages: [...store.statusMessages],
- eventPlan: store.eventPlan.map((e) => ({
- name: e.name,
- description: e.description,
- })),
- };
- return renderFrame(frame, Program.PostHogIntegration);
-}
-
-type Live = { store: WizardStore; driver: WizardCiDriver };
-let active: Live | null = null;
+const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
-// run_agent runs in the background so the tool returns immediately; a multi-minute
-// blocking MCP call makes the client reconnect and lose this process's store.
-let runStatus: 'idle' | 'running' | 'done' | 'failed' = 'idle';
-let runError: string | null = null;
+let cap: TuiCapture | null = null;
+let sockPath = '';
-/** Boot a fresh live wizard on an app and make it the active run. */
-async function openApp(cfg: {
- appDir: string;
- apiKey: string;
- projectId: string;
- region: string;
-}): Promise {
- if (!cfg.appDir || !fs.existsSync(cfg.appDir))
- throw new Error(`appDir missing or not found: ${cfg.appDir}`);
- if (!cfg.apiKey)
- throw new Error('a PostHog key is required (keyFile or apiKey)');
- const store = new WizardStore(Program.PostHogIntegration);
- setUI(new InkUI(store)); // real UI, never rendered → no stdout
- store.session = buildSession({
- installDir: cfg.appDir,
- ci: true, // OAuth-bypass + ai-opt-in auto-consent; phx key as gateway bearer
- apiKey: cfg.apiKey,
- projectId: cfg.projectId,
- region: cfg.region,
+/**
+ * One request/response over the host's control socket (newline-delimited
+ * JSON). Rejects on timeout, socket error, early close, or a malformed reply.
+ */
+function rpc(
+  req: object,
+): Promise<{
+  ok: boolean;
+  state?: unknown;
+  error?: string;
+  runStatus?: string;
+}> {
+  return new Promise<string>((resolve, reject) => {
+    if (!sockPath)
+      return reject(new Error('No app open. Call open_app first.'));
+    const sock = net.connect(sockPath).setEncoding('utf8');
+    let buf = '';
+    const timeout = new Error('control socket timeout');
+    const timer = setTimeout(() => sock.destroy(timeout), 600_000);
+    sock.on('connect', () => sock.write(JSON.stringify(req) + '\n'));
+    sock.on('data', (d) => {
+      buf += d;
+      const i = buf.indexOf('\n');
+      if (i < 0) return;
+      sock.end();
+      resolve(buf.slice(0, i));
+    });
+    sock.on('error', reject);
+    sock.on('close', () => {
+      clearTimeout(timer);
+      reject(new Error('control socket closed before a reply'));
+    });
+  }).then((line) => JSON.parse(line));
- await store.runReadyHooks(); // framework detection
- store.runInitHooks(); // health-check readiness probe
- active = { store, driver: new WizardCiDriver(store) };
- return active;
}
-/** The active run, auto-opening from env if it was provided at launch. */
-async function ensure(): Promise {
- if (active) return active;
- const envKey = (
- process.env.POSTHOG_PERSONAL_API_KEY ??
- (process.env.POSTHOG_KEY_FILE
- ? fs.readFileSync(process.env.POSTHOG_KEY_FILE, 'utf8')
- : '')
- ).trim();
- if (process.env.APP_DIR && envKey)
- return openApp({
- appDir: process.env.APP_DIR,
- apiKey: envKey,
- projectId:
- process.env.PROJECT_ID ?? process.env.POSTHOG_WIZARD_PROJECT_ID ?? '',
- region: process.env.POSTHOG_REGION ?? 'us',
- });
- throw new Error(
- 'No app open. Call open_app({ appDir, keyFile, projectId, region }) first.',
- );
+/** Poll `cond` every 150ms; resolves true once it holds, false on timeout. */
+async function waitFor(cond: () => boolean, ms: number): Promise<boolean> {
+  const end = Date.now() + ms;
+  while (!cond() && Date.now() < end) await sleep(150);
+  // Re-check after the deadline: the condition may have turned true during
+  // the final sleep, which the loop guard alone would report as a timeout.
+  return cond();
+}
async function main() {
@@ -141,7 +90,7 @@ async function main() {
server.tool(
'open_app',
- 'Boot a live wizard run on an app and make it active. Call once before the other tools. appDir is a throwaway copy of the app to integrate. Returns the first screen.',
+ 'Boot the real wizard TUI on an app and make it active. Call once before the other tools. appDir is a throwaway copy of the app to integrate. Returns the first screen.',
{
appDir: z
.string()
@@ -164,16 +113,39 @@ async function main() {
},
async ({ appDir, keyFile, apiKey, projectId, region }) => {
try {
- const key = (
- apiKey ?? (keyFile ? fs.readFileSync(keyFile, 'utf8') : '')
- ).trim();
- const live = await openApp({
- appDir,
- apiKey: key,
- projectId,
- region: region ?? 'us',
+        if (!appDir || !fs.existsSync(appDir))
+          throw new Error(`appDir missing or not found: ${appDir}`);
+        if (!keyFile && !apiKey?.trim())
+          throw new Error('a PostHog key is required (keyFile or apiKey)');
+        if (cap) cap.kill(); // inputs are valid: replace any previous host
+        const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'wizard-ci-'));
+        sockPath = path.join(dir, 'host.sock');
+        const key = keyFile || path.join(dir, 'key');
+        if (!keyFile)
+          fs.writeFileSync(key, (apiKey ?? '').trim(), { mode: 0o600 });
+ const env: NodeJS.ProcessEnv = { ...process.env };
+ for (const k of Object.keys(env))
+ if (/^(CLAUDE|ANTHROPIC)/.test(k)) delete env[k];
+ Object.assign(env, {
+ MODE: 'serve',
+ CONTROL_SOCK: sockPath,
+ SNAP_CTRL: path.join(dir, 'ctrl'),
+ APP_DIR: appDir,
+ POSTHOG_KEY_FILE: key,
+ PROJECT_ID: projectId,
+ POSTHOG_REGION: region ?? 'us',
});
- return text(live.driver.readState());
+ cap = captureTui({
+ cmd: path.join(process.cwd(), 'node_modules/.bin/tsx'),
+ args: ['scripts/tui-host.no-jest.ts'],
+ cwd: process.cwd(),
+ env,
+ });
+ if (!(await waitFor(() => fs.existsSync(sockPath), 30_000)))
+ return errorOut(new Error('the TUI host did not start'));
+ await waitFor(() => cap!.frame().includes('PostHog'), 30_000);
+ const r = await rpc({ type: 'read_state' });
+ return text(r.state ?? r);
} catch (e) {
return errorOut(e);
}
@@ -182,15 +154,12 @@ async function main() {
server.tool(
'read_state',
- "Read the wizard's committed state: current screen, run phase, a secret-free session view, agent tasks/status, any pending question, unresolved setup questions, and the actions legal right now. Call after every perform_action.",
+ "Read the wizard's committed state: current screen, run phase, a secret-free session view, tasks, pending question, and the actions legal now. Call after every perform_action and to poll run_agent (integration: running → done).",
{},
async () => {
try {
- const state = (await ensure()).driver.readState();
- const extra: Record = {};
- if (runStatus !== 'idle') extra.integration = runStatus;
- if (runError) extra.runError = runError;
- return text({ ...state, ...extra });
+ const r = await rpc({ type: 'read_state' });
+ return text(r.state ?? r);
} catch (e) {
return errorOut(e);
}
@@ -199,7 +168,7 @@ async function main() {
server.tool(
'perform_action',
- 'Commit a decision: invoke a legal action for the current screen (e.g. confirm_setup, dismiss_outage, choose, set_mcp_outcome, dismiss_slack, keep_skills). Returns the next state. The action must appear in read_state.actions.',
+ 'Commit a decision on the current screen (confirm_setup, dismiss_outage, choose, set_mcp_outcome, dismiss_slack, keep_skills). The action must appear in read_state.actions. Returns the next state.',
{
action: z.string().describe('Action id from read_state.actions'),
params: z
@@ -209,9 +178,12 @@ async function main() {
},
async ({ action, params }) => {
try {
- return text(
- (await ensure()).driver.performAction(action, params ?? {}),
- );
+ const r = await rpc({
+ type: 'perform_action',
+ action,
+ params: params ?? {},
+ });
+ return text(r.state ?? r);
} catch (e) {
return errorOut(e);
}
@@ -220,11 +192,13 @@ async function main() {
server.tool(
'render_screen',
- 'Render the current TUI screen to ANSI so you can see exactly what the user would.',
+ 'Return the REAL rendered TUI screen (ANSI-stripped text) — exactly what the user would see.',
{},
async () => {
try {
- return text(renderNow((await ensure()).store));
+ if (!cap) throw new Error('No app open. Call open_app first.');
+ await sleep(150); // let the emulator apply the latest frame
+ return text(cap.frame());
} catch (e) {
return errorOut(e);
}
@@ -233,46 +207,15 @@ async function main() {
server.tool(
'run_agent',
- 'Kick off the real wizard integration and return immediately — do NOT expect it to block. It bootstraps credentials from the key and runs the integration in the background; this is what advances the auth and run screens (they never advance on their own). Then POLL read_state every ~10s: runPhase goes running → completed and currentScreen advances to outro. Takes minutes, and creates real PostHog resources (a dashboard + insights) in the project. Call once setup is confirmed.',
+ 'Kick off the real integration in the background and return immediately. It advances the auth and run screens (they never advance on their own). Then poll read_state — integration goes running → done and currentScreen advances to outro. Creates real PostHog resources (a dashboard + insights). Call once setup is confirmed.',
{},
async () => {
try {
- const { store } = await ensure();
- if (runStatus === 'running')
- return text({
- status: 'already running — poll read_state',
- runPhase: store.session.runPhase,
- currentScreen: store.currentScreen,
- });
- if (
- runStatus === 'done' ||
- store.session.runPhase === RunPhase.Completed
- )
- return text({
- status: 'already completed',
- runPhase: store.session.runPhase,
- currentScreen: store.currentScreen,
- });
- runStatus = 'running';
- runError = null;
- // Background: returning now keeps the server responsive so the client
- // doesn't reconnect (which would drop this store). The agent polls.
- void (async () => {
- try {
- await store.getGate('intro');
- await store.getGate('health-check');
- await runAgent(posthogIntegrationConfig, store.session);
- runStatus = 'done';
- } catch (e) {
- runStatus = 'failed';
- runError = e instanceof Error ? e.message : String(e);
- }
- })();
+ const r = await rpc({ type: 'run_agent' });
return text({
status:
- 'integration started in the background — poll read_state every ~10s; runPhase goes running → completed and currentScreen advances to outro',
- runPhase: store.session.runPhase,
- currentScreen: store.currentScreen,
+ 'integration started in the background — poll read_state (integration: running → done; screen advances to outro)',
+ ...r,
});
} catch (e) {
return errorOut(e);
@@ -281,7 +224,7 @@ async function main() {
);
await server.connect(new StdioServerTransport());
- process.stderr.write('wizard-ci-mcp: ready on stdio\n');
+ process.stderr.write('wizard-ci-mcp: proxy ready on stdio\n');
}
main().catch((e) => {
From e120b2afc64f97f5f5333d680898aaea63cc1e72 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 21:58:52 -0400
Subject: [PATCH 33/38] docs: drop stale e2e-full-run reference from a comment
Co-Authored-By: Claude Opus 4.8
---
scripts/tui-host.no-jest.ts | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/scripts/tui-host.no-jest.ts b/scripts/tui-host.no-jest.ts
index a9e68c0e..72603f43 100644
--- a/scripts/tui-host.no-jest.ts
+++ b/scripts/tui-host.no-jest.ts
@@ -205,7 +205,8 @@ async function main() {
await drive;
await sleep(400);
- // Structured result for the --e2e assertion path (same shape e2e-full-run had).
+ // Structured result the --e2e assertion path reads: run phase, posthog deps,
+ // env file, and the screens walked.
if (process.env.E2E_RESULT_JSON) {
const appDir = process.env.APP_DIR!;
let deps: string[] = [];
From e87a322613b8bcabb26580255a715896c6cfea84 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 22:15:03 -0400
Subject: [PATCH 34/38] fix(tui-snapshots): capture the run screen's
progression + every transition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Snapshot on key moments — a screen change, a task-list update, or a runPhase
change — via a store subscription, and snap each screen before the driver acts on
it. The run screen (the agent working) is captured as it progresses, and fast
transitions (intro/auth/outro/mcp/slack) are no longer skipped by throttling.
Co-Authored-By: Claude Opus 4.8
---
scripts/tui-host.no-jest.ts | 49 ++++++++++++++++++++++++++++---------
1 file changed, 37 insertions(+), 12 deletions(-)
diff --git a/scripts/tui-host.no-jest.ts b/scripts/tui-host.no-jest.ts
index 72603f43..ac7e9026 100644
--- a/scripts/tui-host.no-jest.ts
+++ b/scripts/tui-host.no-jest.ts
@@ -151,20 +151,44 @@ async function main() {
const runFull = process.env.RUN_AGENT === '1';
const profile: WizardE2eProfile = profileFor(Program.PostHogIntegration);
const screenPath: string[] = [];
- let lastSnap = '';
- const snap = async () => {
- const s = store.currentScreen;
- if (s === lastSnap) return;
- lastSnap = s;
- screenPath.push(s);
- await sleep(650);
- fs.appendFileSync(CTRL, s + '\n');
- await sleep(400);
+ // Snapshot on key moments — a screen change, a task-list update, or a
+ // runPhase change — so the run screen's progression (the agent working) is
+ // captured, not just screen transitions. The driver loop snaps each screen
+ // before acting on it (so transitions are caught as presented); a store
+ // subscription catches within-screen changes (the run). Deduped by
+ // signature, serialized, and throttled.
+ let lastSig = '';
+ let lastSnapAt = 0;
+ let chain: Promise = Promise.resolve();
+ const signature = () =>
+ JSON.stringify({
+ screen: store.currentScreen,
+ overlay: store.router.hasOverlay,
+ tasks: store.tasks.map((t) => [t.label, t.status, t.done]),
+ phase: store.session.runPhase,
+ });
+ const snap = (): Promise => {
+ const sig = signature();
+ if (sig === lastSig) return chain;
+ lastSig = sig;
+ const screen = store.currentScreen;
+ if (screenPath[screenPath.length - 1] !== screen) screenPath.push(screen);
+ chain = chain.then(async () => {
+ const gap = Date.now() - lastSnapAt;
+ if (gap < 600) await sleep(600 - gap); // throttle bursts
+ await sleep(650); // settle (and let detection fill in on the intro)
+ fs.appendFileSync(CTRL, store.currentScreen + '\n');
+ lastSnapAt = Date.now();
+ await sleep(400); // give the capturer time before the screen moves on
+ });
+ return chain;
};
+ const unsub = store.subscribe(() => void snap());
+
let stop = false;
const driverLoop = async () => {
while (!stop && !store.session.skillsComplete) {
- await snap();
+ await snap(); // capture this screen as presented, before acting
const state = driver.readState();
const before = state.currentScreen;
let acted = false;
@@ -199,11 +223,12 @@ async function main() {
} else {
await authByState();
await sleep(2500);
- await snap(); // the run screen
}
stop = true;
await drive;
- await sleep(400);
+ unsub();
+ await snap(); // the final screen
+ await chain; // flush any pending snapshots
// Structured result the --e2e assertion path reads: run phase, posthog deps,
// env file, and the screens walked.
From 3bc6b1a1b7298b5f614dd3e77c13f8fb270b9e13 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 22:22:23 -0400
Subject: [PATCH 35/38] fix(tui-snapshots): drop the throttle; don't hang at
exit on the parked loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Snapshot on every key-moment change (no throttle spacing, just a settle). And
don't await the driver loop at exit — on the cheap (no-agent) path it's parked in
waitForChange, so awaiting it hung the process and exited non-zero, which would
fail CI. The process now exits 0 cleanly.
Co-Authored-By: Claude Opus 4.8
---
scripts/tui-host.no-jest.ts | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/scripts/tui-host.no-jest.ts b/scripts/tui-host.no-jest.ts
index ac7e9026..958e7b07 100644
--- a/scripts/tui-host.no-jest.ts
+++ b/scripts/tui-host.no-jest.ts
@@ -158,7 +158,6 @@ async function main() {
// subscription catches within-screen changes (the run). Deduped by
// signature, serialized, and throttled.
let lastSig = '';
- let lastSnapAt = 0;
let chain: Promise = Promise.resolve();
const signature = () =>
JSON.stringify({
@@ -174,12 +173,9 @@ async function main() {
const screen = store.currentScreen;
if (screenPath[screenPath.length - 1] !== screen) screenPath.push(screen);
chain = chain.then(async () => {
- const gap = Date.now() - lastSnapAt;
- if (gap < 600) await sleep(600 - gap); // throttle bursts
- await sleep(650); // settle (and let detection fill in on the intro)
+ await sleep(500); // settle: let the frame finish drawing
fs.appendFileSync(CTRL, store.currentScreen + '\n');
- lastSnapAt = Date.now();
- await sleep(400); // give the capturer time before the screen moves on
+ await sleep(300); // let the capturer capture before the screen moves on
});
return chain;
};
@@ -224,8 +220,11 @@ async function main() {
await authByState();
await sleep(2500);
}
+ // The driver loop has nothing left to do (full run reached skillsComplete; the
+ // cheap path stops after auth) but may be parked in waitForChange — don't
+ // block on it; the process exit below ends it.
stop = true;
- await drive;
+ void drive;
unsub();
await snap(); // the final screen
await chain; // flush any pending snapshots
From fd5599fb2818650187baf31ffc36f193bb2e2349 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 22:26:06 -0400
Subject: [PATCH 36/38] refactor(tui-snapshots): always run the agent; drop the
RUN_AGENT toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The fixed CI route always drives the full real agent run — a no-agent path was
pointless (and is what hung at exit). Removes the RUN_AGENT branch and the
auth-by-state shortcut it needed in fixed mode; auth is bootstrapped by the run.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/ARCHITECTURE.md | 6 +++---
scripts/README.md | 2 +-
scripts/tui-host.no-jest.ts | 21 +++++++--------------
scripts/tui-snapshots.no-jest.ts | 6 +++---
4 files changed, 14 insertions(+), 21 deletions(-)
diff --git a/e2e-harness/ARCHITECTURE.md b/e2e-harness/ARCHITECTURE.md
index 0cf9c2e5..0317ccfb 100644
--- a/e2e-harness/ARCHITECTURE.md
+++ b/e2e-harness/ARCHITECTURE.md
@@ -46,9 +46,9 @@ OAuth token takes, so the auth screen advances with no browser and no keystrokes
## The two routes
- **CI snapshots** — `tui-snapshots.no-jest.ts` spawns `tui-host` (`MODE=fixed`)
- in a PTY. The host self-drives the fixed profile (`decideE2eAction`) and signals
- each new screen; the parent writes the real rendered screen to
- `SNAP_OUT/NN-.txt`. `RUN_AGENT=1` runs the real agent through to outro.
+ in a PTY. The host self-drives the fixed profile (`decideE2eAction`) through the
+ real agent run and signals each key moment; the parent writes the real rendered
+ screen to `SNAP_OUT/NN-<screen>.txt` (including the run screen's progression).
- **Agent** — `wizard-ci-mcp.no-jest.ts` is a stdio MCP server that spawns
`tui-host` (`MODE=serve`) and proxies: `read_state` / `perform_action` /
`run_agent` forward over a unix socket; `render_screen` returns the real
diff --git a/scripts/README.md b/scripts/README.md
index 0a9aacbb..30bd9546 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -15,7 +15,7 @@ real ink render) and is driven purely by store state manipulation; a PTY parent
| Script | What it does | Needs |
| ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ |
| **`tui-host.no-jest.ts`** | The real-TUI host. `MODE=fixed` self-drives the fixed e2e profile and signals each screen; `MODE=serve` accepts drive commands (`read_state`/`perform_action`/`run_agent`) over a unix socket. | `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID`; run under a PTY |
-| **`tui-snapshots.no-jest.ts`** | CI snapshot route: spawns `tui-host` (`MODE=fixed`) in a PTY and writes the **real rendered** screen to `SNAP_OUT/NN-.txt` at each key moment. `RUN_AGENT=1` for the full run through outro. | `SNAP_OUT`, `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID` |
+| **`tui-snapshots.no-jest.ts`** | CI snapshot route: spawns `tui-host` (`MODE=fixed`) in a PTY, runs the full real agent flow, and writes the **real rendered** screen to `SNAP_OUT/NN-<screen>.txt` at each key moment (incl. the run screen's progression). | `SNAP_OUT`, `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID` |
| **`wizard-ci-mcp.no-jest.ts`** | Agent route: a stdio **MCP server** that proxies `tui-host` (`MODE=serve`) — `read_state`/`perform_action`/`run_agent` forward over the socket, `render_screen` returns the real captured frame. | spawns the host itself; key passed via `open_app` |
| **`wizard-ci-explore.no-jest.ts`** | Quick eyeball of the agent route: drives the MCP server (`open_app → confirm_setup → render_screen`) and prints the real TUI. `pnpm wizard-ci-explore`. | `APP_DIR`, `POSTHOG_KEY_FILE`, `PROJECT_ID` |
diff --git a/scripts/tui-host.no-jest.ts b/scripts/tui-host.no-jest.ts
index 958e7b07..119563ca 100644
--- a/scripts/tui-host.no-jest.ts
+++ b/scripts/tui-host.no-jest.ts
@@ -148,7 +148,6 @@ async function main() {
// ---- CI route: self-drive the fixed profile, snapshot each screen ----
async function fixed() {
const CTRL = process.env.SNAP_CTRL!;
- const runFull = process.env.RUN_AGENT === '1';
const profile: WizardE2eProfile = profileFor(Program.PostHogIntegration);
const screenPath: string[] = [];
// Snapshot on key moments — a screen change, a task-list update, or a
@@ -156,7 +155,7 @@ async function main() {
// captured, not just screen transitions. The driver loop snaps each screen
// before acting on it (so transitions are caught as presented); a store
// subscription catches within-screen changes (the run). Deduped by
- // signature, serialized, and throttled.
+ // signature and serialized.
let lastSig = '';
let chain: Promise = Promise.resolve();
const signature = () =>
@@ -211,18 +210,12 @@ async function main() {
await store.getGate('intro');
await store.getGate('health-check');
- if (runFull) {
- await runAgent(posthogIntegrationConfig, store.session);
- const deadline = Date.now() + 120_000;
- while (!store.session.skillsComplete && Date.now() < deadline)
- await driver.waitForChange(5_000);
- } else {
- await authByState();
- await sleep(2500);
- }
- // The driver loop has nothing left to do (full run reached skillsComplete; the
- // cheap path stops after auth) but may be parked in waitForChange — don't
- // block on it; the process exit below ends it.
+ await runAgent(posthogIntegrationConfig, store.session);
+ const deadline = Date.now() + 120_000;
+ while (!store.session.skillsComplete && Date.now() < deadline)
+ await driver.waitForChange(5_000);
+ // The wait above ended (skillsComplete, or the 120s deadline). The driver loop
+ // may still be parked in waitForChange, so don't block on it; process exit ends it.
stop = true;
void drive;
unsub();
diff --git a/scripts/tui-snapshots.no-jest.ts b/scripts/tui-snapshots.no-jest.ts
index ef946466..4c0ee881 100644
--- a/scripts/tui-snapshots.no-jest.ts
+++ b/scripts/tui-snapshots.no-jest.ts
@@ -2,11 +2,11 @@
* Fixed-route snapshots of the REAL TUI (Node, single-stack).
*
* Spawns the real-TUI host (MODE=fixed) in a PTY, lets it self-drive the fixed
- * e2e profile, and writes the real rendered screen to SNAP_OUT/NN-.txt
- * each time the host signals a new screen via the control file.
+ * e2e profile through the real agent run, and writes the real rendered screen to
+ * SNAP_OUT/NN-<screen>.txt at each key moment the host signals.
*
* SNAP_OUT=/tmp/snaps APP_DIR=/tmp/app POSTHOG_KEY_FILE=… PROJECT_ID=… \
- * RUN_AGENT=0|1 npx tsx scripts/tui-snapshots.no-jest.ts
+ * npx tsx scripts/tui-snapshots.no-jest.ts
*/
import fs from 'fs';
import path from 'path';
From 4ed8691ada538d6812c1bd05ad6b6b52783bd187 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Mon, 22 Jun 2026 23:32:30 -0400
Subject: [PATCH 37/38] build: allow node-pty's build script (compiles pty.node
on Linux CI)
node-pty ships no linux-x64 prebuilt, so CI must compile it; pnpm 10 blocks build
scripts unless allowlisted.
Co-Authored-By: Claude Opus 4.8
---
package.json | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/package.json b/package.json
index bcdb743b..36b8255c 100644
--- a/package.json
+++ b/package.json
@@ -185,5 +185,10 @@
"volta": {
"node": "24.14.1",
"pnpm": "10.23.0"
+ },
+ "pnpm": {
+ "onlyBuiltDependencies": [
+ "node-pty"
+ ]
}
}
From 72a1051674aae86f8fc5a61841710ab1269c8d62 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge"
Date: Tue, 23 Jun 2026 00:44:13 -0400
Subject: [PATCH 38/38] fix(tui-capture): strip CI markers so ink renders the
real TUI
ink renders non-interactively when it detects CI (CI / GITHUB_ACTIONS), leaving
the captured xterm buffer blank. Strip them from the spawned host's env. Verified
locally: with CI=true, render_screen now returns the real TUI instead of blank.
Co-Authored-By: Claude Opus 4.8
---
e2e-harness/tui-capture.ts | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/e2e-harness/tui-capture.ts b/e2e-harness/tui-capture.ts
index d4bbdcfa..0a7ac388 100644
--- a/e2e-harness/tui-capture.ts
+++ b/e2e-harness/tui-capture.ts
@@ -59,12 +59,17 @@ export function captureTui(opts: {
const rows = opts.rows ?? (Number(process.env.PTY_ROWS) || 50);
ensureSpawnHelper();
const term = new Terminal({ cols, rows, allowProposedApi: true });
+ // Strip CI markers: ink renders non-interactively when it detects CI, which
+ // leaves the captured screen blank. We want the real interactive TUI.
+ const childEnv = { ...opts.env };
+ for (const k of ['CI', 'CONTINUOUS_INTEGRATION', 'GITHUB_ACTIONS'])
+ delete childEnv[k];
const child = pty.spawn(opts.cmd, opts.args, {
name: 'xterm-256color',
cols,
rows,
cwd: opts.cwd,
- env: opts.env as { [key: string]: string },
+ env: childEnv as { [key: string]: string },
});
const cbs: Array<() => void> = [];