From 1376a78f514f818dc87fa93745d2a2da62749f5c Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <gewenyu99@gmail.com>
Date: Mon, 8 Jun 2026 21:50:30 -0400
Subject: [PATCH 01/12] feat(orchestrator): flag gating + shared bootstrap
 extraction

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agent/__tests__/variant-gating.test.ts    |  36 ++++++
 src/lib/agent/agent-interface.ts              |  12 ++
 src/lib/agent/agent-runner.ts                 | 111 +++++++++++++++---
 src/lib/constants.ts                          |   2 +
 4 files changed, 143 insertions(+), 18 deletions(-)
 create mode 100644 src/lib/agent/__tests__/variant-gating.test.ts

diff --git a/src/lib/agent/__tests__/variant-gating.test.ts b/src/lib/agent/__tests__/variant-gating.test.ts
new file mode 100644
index 00000000..699bd096
--- /dev/null
+++ b/src/lib/agent/__tests__/variant-gating.test.ts
@@ -0,0 +1,36 @@
+import {
+  buildWizardMetadata,
+  isOrchestratorEnabled,
+} from '@lib/agent/agent-interface';
+
+describe('isOrchestratorEnabled', () => {
+  it('is true only when the wizard-orchestrator flag is true', () => {
+    expect(isOrchestratorEnabled({ 'wizard-orchestrator': 'true' })).toBe(true);
+  });
+
+  it('is false when the flag is false, another flag, or absent', () => {
+    expect(isOrchestratorEnabled({ 'wizard-orchestrator': 'false' })).toBe(
+      false,
+    );
+    expect(isOrchestratorEnabled({ 'wizard-variant': 'orchestrator' })).toBe(
+      false,
+    );
+    expect(isOrchestratorEnabled({})).toBe(false);
+    expect(isOrchestratorEnabled()).toBe(false);
+  });
+});
+
+describe('buildWizardMetadata', () => {
+  it('selects a known variant header from the flag', () => {
+    expect(buildWizardMetadata({ 'wizard-variant': 'subagents' })).toEqual({
+      VARIANT: 'subagents',
+    });
+  });
+
+  it('falls back to the base variant for unknown or missing flags', () => {
+    expect(buildWizardMetadata({ 'wizard-variant': 'nope' })).toEqual({
+      VARIANT: 'base',
+    });
+    expect(buildWizardMetadata({})).toEqual({ VARIANT: 'base' });
+  });
+});
diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts
index aefb5bc4..46f375a2 100644
--- a/src/lib/agent/agent-interface.ts
+++ b/src/lib/agent/agent-interface.ts
@@ -15,6 +15,7 @@ import {
   POSTHOG_PROPERTY_HEADER_PREFIX,
   WIZARD_VARIANT_FLAG_KEY,
   WIZARD_VARIANTS,
+  WIZARD_ORCHESTRATOR_FLAG_KEY,
   WIZARD_USER_AGENT,
 } from '@lib/constants';
 import {
@@ -245,6 +246,17 @@ export function buildWizardMetadata(
   return { ...variant };
 }
 
+/**
+ * Whether this run uses the experimental task-queue orchestrator. Gated by the
+ * boolean `wizard-orchestrator` feature flag, targeted to the user in the wizard's
+ * analytics project. Flag values arrive as strings; only `'true'` enables it.
+ */
+export function isOrchestratorEnabled(
+  flags: Record<string, string> = {},
+): boolean {
+  return flags[WIZARD_ORCHESTRATOR_FLAG_KEY] === 'true';
+}
+
 /**
  * Build env for the SDK subprocess: process.env plus ANTHROPIC_CUSTOM_HEADERS, which always
  * includes `x-posthog-use-bedrock-fallback: true` so the LLM gateway falls back to Bedrock on
diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index 4af134ff..77c2f9ce 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -9,9 +9,11 @@
  *   - What MCP servers and package manager detector to use
  *   - What happens after the agent completes
  *
- * The pipeline itself is fixed:
- *   init → health check → settings → OAuth → [skill install] →
- *   agent init → prompt → run → errors → [postRun] → outro
+ * The pipeline runs a shared bootstrap (logging, health check, settings, OAuth,
+ * flags, MCP url), then forks. Runs with the boolean `wizard-orchestrator` flag
+ * on route to the experimental task-queue runner. Every other run takes the
+ * fixed linear
+ *   [skill install] → agent init → prompt → run → errors → [postRun] → outro
  */
 
 import {
@@ -53,7 +55,7 @@ import { getSkillsBaseUrl } from '@lib/constants';
 import { runtimeEnv } from '@env';
 import { installSkillById, type InstallSkillResult } from '@lib/wizard-tools';
 import { createWizardAskBridge } from '@lib/wizard-ask-bridge';
-import type { WizardRunOptions } from '@utils/types';
+import type { WizardRunOptions, CloudRegion } from '@utils/types';
 
 import type { ProgramConfig } from '@lib/programs/program-step';
 import { assemblePrompt, type PromptContext } from './agent-prompt';
@@ -108,7 +110,7 @@ export interface ProgramRun {
   buildOutroData?: (
     session: WizardSession,
     credentials: Credentials,
-    cloudRegion: import('@utils/types').CloudRegion | undefined,
+    cloudRegion: CloudRegion | undefined,
   ) => WizardSession['outroData'];
   /**
    * Per-run cap on `wizard_ask` invocations. Defaults to 10. The 4th call
@@ -124,6 +126,23 @@ export interface ProgramRun {
   askTimeoutMs?: number;
 }
 
+/**
+ * Result of the shared bootstrap, consumed by both the linear and the
+ * orchestrator arm. Credentials, role, and user are already applied to the
+ * session by `bootstrapProgram`; this carries the values both arms still need.
+ */
+export interface BootstrapResult {
+  skillsBaseUrl: string;
+  projectApiKey: Credentials['projectApiKey'];
+  host: Credentials['host'];
+  accessToken: Credentials['accessToken'];
+  projectId: Credentials['projectId'];
+  cloudRegion: CloudRegion;
+  mcpUrl: string;
+  wizardFlags: Record<string, string>;
+  wizardMetadata: Record<string, string>;
+}
+
 // ── Helpers ──────────────────────────────────────────────────────────
 
 /**
@@ -179,16 +198,31 @@ export async function runAgent(
 /**
  * Run a program's agent pipeline.
  *
- * This is the single execution path for all programs — both skill-based
- * (revenue analytics) and framework-based (core integration). The
- * `ProgramRun` controls what varies between them; `programConfig` carries
- * the program-level static metadata (tool allow/disallow lists, etc.).
+ * Runs the shared bootstrap, then forks on the boolean `wizard-orchestrator`
+ * flag (not `wizard-variant`). When it is on, the run routes to the
+ * experimental task-queue runner; otherwise it runs the linear pipeline.
  */
 export async function runProgram(
   session: WizardSession,
   config: ProgramRun,
   programConfig: ProgramConfig,
 ): Promise<void> {
+  const boot = await bootstrapProgram(session, config, programConfig);
+
+  return runLinearProgram(session, config, programConfig, boot);
+}
+
+/**
+ * Shared setup for both arms: logging, health check, settings conflicts, OAuth
+ * and credentials, then the feature flags, variant metadata, and MCP url. Sets
+ * `session.credentials`, role, and user as a side effect. Returns the values the
+ * arms still need.
+ */
+async function bootstrapProgram(
+  session: WizardSession,
+  config: ProgramRun,
+  programConfig: ProgramConfig,
+): Promise<BootstrapResult> {
   // 1. Init logging + debug
   initLogFile();
   session.skillId = config.skillId ?? config.integrationLabel;
@@ -310,10 +344,60 @@ export async function runProgram(
   // install and agent start, so no source leaves the machine. The screen
   // alone is cosmetic; this await is the actual gate. Resolves
   // immediately when the program declared requiresAi: false or in CI.
+  // In bootstrapProgram so both the linear and orchestrator arms gate.
   logToFile('[agent-runner] checking AI opt-in gate');
   await getUI().waitForAiOptIn();
   logToFile('[agent-runner] AI opt-in gate cleared');
 
+  // Feature flags, variant metadata, and MCP url. Both arms need these, and the
+  // fork decision reads the flags.
+  const wizardFlags = await analytics.getAllFlagsForWizard();
+  const wizardMetadata = buildWizardMetadata(wizardFlags);
+
+  const mcpUrl = session.localMcp
+    ? 'http://localhost:8787/mcp'
+    : runtimeEnv('MCP_URL') ||
+      (cloudRegion === 'eu'
+        ? 'https://mcp-eu.posthog.com/mcp'
+        : 'https://mcp.posthog.com/mcp');
+
+  return {
+    skillsBaseUrl,
+    projectApiKey,
+    host,
+    accessToken,
+    projectId,
+    cloudRegion,
+    mcpUrl,
+    wizardFlags,
+    wizardMetadata,
+  };
+}
+
+/**
+ * The linear pipeline. Single execution path for all non-orchestrator programs,
+ * both skill-based (revenue analytics) and framework-based (core integration).
+ * The `ProgramRun` controls what varies between them; `programConfig` carries the
+ * program-level static metadata (tool allow/disallow lists, etc.).
+ */
+async function runLinearProgram(
+  session: WizardSession,
+  config: ProgramRun,
+  programConfig: ProgramConfig,
+  boot: BootstrapResult,
+): Promise<void> {
+  const {
+    skillsBaseUrl,
+    projectApiKey,
+    host,
+    accessToken,
+    projectId,
+    cloudRegion,
+    mcpUrl,
+    wizardFlags,
+    wizardMetadata,
+  } = boot;
+
   // 5. Skill install (if skillId provided)
   let skillPath: string | undefined;
   if (config.skillId) {
@@ -333,15 +417,6 @@ export async function runProgram(
 
   // 6. Initialize agent
   const spinner = getUI().spinner();
-  const wizardFlags = await analytics.getAllFlagsForWizard();
-  const wizardMetadata = buildWizardMetadata(wizardFlags);
-
-  const mcpUrl = session.localMcp
-    ? 'http://localhost:8787/mcp'
-    : runtimeEnv('MCP_URL') ||
-      (cloudRegion === 'eu'
-        ? 'https://mcp-eu.posthog.com/mcp'
-        : 'https://mcp.posthog.com/mcp');
 
   const restoreSettings = () => restoreClaudeSettings(session.installDir);
   getUI().onEnterScreen('outro', restoreSettings);
diff --git a/src/lib/constants.ts b/src/lib/constants.ts
index 055df06d..09eae21d 100644
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -176,6 +176,8 @@ export const WIZARD_INTERACTION_EVENT_NAME = 'wizard interaction';
 export const WIZARD_REMARK_EVENT_NAME = 'wizard remark';
 /** Feature flag key whose value selects a variant from WIZARD_VARIANTS. */
 export const WIZARD_VARIANT_FLAG_KEY = 'wizard-variant';
+/** Boolean feature flag that routes a run to the experimental orchestrator runner. */
+export const WIZARD_ORCHESTRATOR_FLAG_KEY = 'wizard-orchestrator';
 /** Feature flag key that gates the intro-screen "Tools" menu. */
 export const WIZARD_TOOLS_MENU_FLAG_KEY = 'wizard-tools-menu';
 /** Variant key -> metadata for wizard run (VARIANT flag selects which entry to use). */

From c94739ed4267ff26ebc157814417faec4799b5d1 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:52:05 -0400
Subject: [PATCH 02/12] feat(orchestrator): in-memory queue + disk persistence
 (QueueStore) (#607)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../orchestrator/__tests__/queue.test.ts      | 135 ++++++++++
 src/lib/programs/orchestrator/queue.ts        | 239 ++++++++++++++++++
 src/lib/wizard-tools.ts                       |  23 +-
 src/utils/atomic-ledger.ts                    |  29 +++
 4 files changed, 406 insertions(+), 20 deletions(-)
 create mode 100644 src/lib/programs/orchestrator/__tests__/queue.test.ts
 create mode 100644 src/lib/programs/orchestrator/queue.ts
 create mode 100644 src/utils/atomic-ledger.ts

diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts
new file mode 100644
index 00000000..4a18dee2
--- /dev/null
+++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts
@@ -0,0 +1,135 @@
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  QueueStore,
+  type QueueFile,
+  type TaskHandoff,
+} from '@lib/programs/orchestrator/queue';
+
+function tmpDir(): string {
+  return fs.mkdtempSync(path.join(os.tmpdir(), 'queue-test-'));
+}
+
+describe('QueueStore', () => {
+  let dir: string;
+  let q: QueueStore;
+
+  beforeEach(() => {
+    dir = tmpDir();
+    q = new QueueStore(dir, 'run-1');
+  });
+
+  afterEach(() => {
+    fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('enqueues a pending task with defaults', () => {
+    const t = q.enqueue({ type: 'install' });
+    expect(t.status).toBe('pending');
+    expect(t.attempts).toBe(0);
+    expect(t.maxAttempts).toBe(2);
+    expect(t.enqueuedBy).toBe('orchestrator');
+    expect(t.dependsOn).toEqual([]);
+    expect(q.list()).toHaveLength(1);
+  });
+
+  it('only marks a task runnable once its dependencies are done', () => {
+    const a = q.enqueue({ type: 'install' });
+    const b = q.enqueue({ type: 'init', dependsOn: [a.id] });
+
+    expect(q.nextRunnable().map((t) => t.id)).toEqual([a.id]);
+
+    q.start(a.id);
+    q.complete(a.id);
+    expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]);
+  });
+
+  it('returns every runnable task; the graph alone decides parallelism', () => {
+    const a = q.enqueue({ type: 'install' });
+    const b = q.enqueue({ type: 'init' });
+    q.enqueue({ type: 'capture', dependsOn: [a.id, b.id] });
+
+    // Both independent tasks are runnable at once; the dependent one is not.
+    expect(
+      q
+        .nextRunnable()
+        .map((t) => t.id)
+        .sort(),
+    ).toEqual([a.id, b.id].sort());
+
+    q.start(a.id);
+    // An in-progress task is no longer offered.
+    expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]);
+  });
+
+  it('treats a skipped dependency as satisfied', () => {
+    const a = q.enqueue({ type: 'install' });
+    const b = q.enqueue({ type: 'init', dependsOn: [a.id] });
+
+    q.start(a.id);
+    q.skip(a.id);
+    expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]);
+  });
+
+  it('start increments attempts and supports within-run retry while attempts remain', () => {
+    const t = q.enqueue({ type: 'install', maxAttempts: 2 });
+    q.start(t.id);
+    expect(q.get(t.id)?.attempts).toBe(1);
+
+    q.fail(t.id, { type: 'API_ERROR', message: 'boom' });
+    expect(q.get(t.id)?.status).toBe('failed');
+
+    // Retry: attempts (1) < maxAttempts (2), so requeue and run again.
+    q.requeue(t.id);
+    expect(q.get(t.id)?.status).toBe('pending');
+    q.start(t.id);
+    expect(q.get(t.id)?.attempts).toBe(2);
+  });
+
+  it('completing a task records and reads back a structured handoff', () => {
+    const t = q.enqueue({ type: 'install' });
+    const handoff: TaskHandoff = {
+      goals: 'install the sdk',
+      did: 'added posthog-js',
+      forNextAgent: 'env vars not set yet',
+      filesTouched: ['package.json'],
+    };
+    q.start(t.id);
+    q.complete(t.id, handoff);
+
+    expect(q.get(t.id)?.status).toBe('done');
+    expect(q.readHandoff(t.id)).toEqual(handoff);
+    expect(q.readHandoffsByType('install')).toEqual([handoff]);
+  });
+
+  it('is drained when a pending task is blocked by a failed dependency', () => {
+    const a = q.enqueue({ type: 'install' });
+    q.enqueue({ type: 'init', dependsOn: [a.id] });
+
+    expect(q.isDrained()).toBe(false);
+    q.start(a.id);
+    q.fail(a.id, { type: 'API_ERROR', message: 'boom' });
+
+    // init can never run now, and nothing is in progress.
+    expect(q.nextRunnable()).toHaveLength(0);
+    expect(q.isDrained()).toBe(true);
+  });
+
+  it('reflects every transition to queue.json, handoffs included', () => {
+    const a = q.enqueue({ type: 'install' });
+    q.start(a.id);
+    q.complete(a.id, {
+      goals: 'g',
+      did: 'd',
+      forNextAgent: 'n',
+    });
+
+    const file = JSON.parse(fs.readFileSync(q.queuePath, 'utf8')) as QueueFile;
+    expect(file.version).toBe(1);
+    expect(file.runId).toBe('run-1');
+    expect(file.tasks).toHaveLength(1);
+    expect(file.tasks[0].status).toBe('done');
+    expect(file.tasks[0].handoff?.did).toBe('d');
+  });
+});
diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts
new file mode 100644
index 00000000..5f62c718
--- /dev/null
+++ b/src/lib/programs/orchestrator/queue.ts
@@ -0,0 +1,239 @@
+/**
+ * The orchestrator task queue.
+ *
+ * In memory, synchronous, single-owner: one Node process drives the run, so
+ * there is no locking. The queue imposes no execution policy — `nextRunnable`
+ * returns every pending task whose dependencies are satisfied, and how many of
+ * those run at once is decided by the task graph, not the queue.
+ *
+ * Every transition rewrites `<installDir>/.posthog-wizard/queue.json`, a small
+ * file holding the whole queue, handoffs included. Today it is the run's
+ * log and the report's source; later it is the resume point.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+import { randomUUID } from 'crypto';
+import { writeJsonAtomic } from '../../../utils/atomic-ledger';
+
+export const TaskStatus = {
+  Pending: 'pending',
+  Running: 'running',
+  Done: 'done',
+  Skipped: 'skipped',
+  Failed: 'failed',
+} as const;
+
+export type TaskStatus = (typeof TaskStatus)[keyof typeof TaskStatus];
+
+export interface QueuedTask {
+  id: string;
+  type: string;
+  status: TaskStatus;
+  dependsOn: string[];
+  inputs: Record<string, unknown>;
+  model?: string;
+  attempts: number;
+  maxAttempts: number;
+  /** The structured handoff the task reported on completion. */
+  handoff?: TaskHandoff;
+  /** 'orchestrator' for seeded tasks, or the id of the task that enqueued this one. */
+  enqueuedBy: string;
+  createdAt: string;
+  startedAt?: string;
+  finishedAt?: string;
+  error?: { type: string; message: string };
+}
+
+export interface QueueFile {
+  version: 1;
+  runId: string;
+  tasks: QueuedTask[];
+}
+
+/** The structured handoff a task leaves for the next agent. */
+export interface TaskHandoff {
+  goals: string;
+  did: string;
+  forNextAgent: string;
+  filesTouched?: string[];
+}
+
+export interface EnqueueInput {
+  type: string;
+  inputs?: Record<string, unknown>;
+  dependsOn?: string[];
+  model?: string;
+  maxAttempts?: number;
+  enqueuedBy?: string;
+}
+
+export const QUEUE_DIR_NAME = '.posthog-wizard';
+const DEFAULT_MAX_ATTEMPTS = 2;
+
+function nowIso(): string {
+  return new Date().toISOString();
+}
+
+export class QueueStore {
+  private tasks: QueuedTask[] = [];
+
+  readonly runId: string;
+  readonly queuePath: string;
+
+  constructor(installDir: string, runId: string) {
+    this.runId = runId;
+    const dir = path.join(installDir, QUEUE_DIR_NAME);
+    this.queuePath = path.join(dir, 'queue.json');
+    fs.mkdirSync(dir, { recursive: true });
+  }
+
+  // ── Reads ───────────────────────────────────────────────────────────
+
+  list(): readonly QueuedTask[] {
+    return this.tasks;
+  }
+
+  get(id: string): QueuedTask | undefined {
+    return this.tasks.find((t) => t.id === id);
+  }
+
+  /**
+   * Every pending task whose dependencies are all satisfied (`done` or
+   * `skipped`). A skipped dependency does not block downstream work.
+   */
+  nextRunnable(): QueuedTask[] {
+    const doneIds = new Set(
+      this.tasks
+        .filter(
+          (t) =>
+            t.status === TaskStatus.Done || t.status === TaskStatus.Skipped,
+        )
+        .map((t) => t.id),
+    );
+    return this.tasks.filter(
+      (t) =>
+        t.status === TaskStatus.Pending &&
+        t.dependsOn.every((d) => doneIds.has(d)),
+    );
+  }
+
+  /**
+   * True when no task is in progress and none can be started. Either everything
+   * is terminal, or the only pending tasks are blocked by a failed dependency.
+   */
+  isDrained(): boolean {
+    if (this.tasks.some((t) => t.status === TaskStatus.Running)) return false;
+    return this.nextRunnable().length === 0;
+  }
+
+  summary(): Record<TaskStatus, number> & { total: number } {
+    const counts: Record<TaskStatus, number> = {
+      [TaskStatus.Pending]: 0,
+      [TaskStatus.Running]: 0,
+      [TaskStatus.Done]: 0,
+      [TaskStatus.Skipped]: 0,
+      [TaskStatus.Failed]: 0,
+    };
+    for (const t of this.tasks) counts[t.status] += 1;
+    return { ...counts, total: this.tasks.length };
+  }
+
+  readHandoff(id: string): TaskHandoff | null {
+    return this.get(id)?.handoff ?? null;
+  }
+
+  /** Handoffs recorded by tasks of a given type, whatever their status, oldest first. */
+  readHandoffsByType(type: string): TaskHandoff[] {
+    return this.tasks
+      .filter((t) => t.type === type && t.handoff)
+      .map((t) => t.handoff as TaskHandoff);
+  }
+
+  // ── Transitions (each one reflected to queue.json) ──────────────────
+
+  enqueue(input: EnqueueInput): QueuedTask {
+    const task: QueuedTask = {
+      id: randomUUID(),
+      type: input.type,
+      status: TaskStatus.Pending,
+      dependsOn: input.dependsOn ?? [],
+      inputs: input.inputs ?? {},
+      model: input.model,
+      attempts: 0,
+      maxAttempts: input.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
+      enqueuedBy: input.enqueuedBy ?? 'orchestrator',
+      createdAt: nowIso(),
+    };
+    this.tasks.push(task);
+    this.reflect();
+    return task;
+  }
+
+  start(id: string): QueuedTask {
+    const t = this.require(id);
+    t.status = TaskStatus.Running;
+    t.startedAt = nowIso();
+    t.attempts += 1;
+    this.reflect();
+    return t;
+  }
+
+  complete(id: string, handoff?: TaskHandoff): QueuedTask {
+    return this.finish(id, TaskStatus.Done, handoff);
+  }
+
+  /** Terminal: the agent could not do the task. Not done, not failed. */
+  skip(id: string, handoff?: TaskHandoff): QueuedTask {
+    return this.finish(id, TaskStatus.Skipped, handoff);
+  }
+
+  fail(
+    id: string,
+    error: { type: string; message: string },
+    handoff?: TaskHandoff,
+  ): QueuedTask {
+    const t = this.require(id);
+    t.error = error;
+    return this.finish(id, TaskStatus.Failed, handoff);
+  }
+
+  /** Put a failed/running task back to pending for an in-run retry; `attempts` is kept. */
+  requeue(id: string): QueuedTask {
+    const t = this.require(id);
+    t.status = TaskStatus.Pending;
+    t.startedAt = undefined;
+    t.finishedAt = undefined;
+    this.reflect();
+    return t;
+  }
+
+  // ── Internals ───────────────────────────────────────────────────────
+
+  private finish(
+    id: string,
+    status: 'done' | 'skipped' | 'failed',
+    handoff?: TaskHandoff,
+  ): QueuedTask {
+    const t = this.require(id);
+    if (handoff) t.handoff = handoff;
+    t.status = status;
+    t.finishedAt = nowIso();
+    this.reflect();
+    return t;
+  }
+
+  private reflect(): void {
+    const file: QueueFile = {
+      version: 1,
+      runId: this.runId,
+      tasks: this.tasks,
+    };
+    writeJsonAtomic(this.queuePath, file);
+  }
+
+  private require(id: string): QueuedTask {
+    const t = this.get(id);
+    if (!t) throw new Error(`No task ${id} in the queue`);
+    return t;
+  }
+}
diff --git a/src/lib/wizard-tools.ts b/src/lib/wizard-tools.ts
index 7b2f6693..8d2f8d37 100644
--- a/src/lib/wizard-tools.ts
+++ b/src/lib/wizard-tools.ts
@@ -16,6 +16,7 @@ import { z } from 'zod';
 import { logToFile } from '@utils/debug';
 import { analytics } from '@utils/analytics';
 import { skillTmpPath } from '@utils/paths';
+import { writeJsonAtomic, makeMutex } from '@utils/atomic-ledger';
 import type { PackageManagerDetector } from './detection/package-manager';
 import {
   AUDIT_CHECKS_FILE,
@@ -389,14 +390,9 @@ const auditUpdateSchema = z.object({
   details: z.string().optional(),
 });
 
-/**
- * Atomically write JSON: write to .tmp then rename. The rename is what bumps
- * the file's mtime, which is what the UI's file watcher polls on.
- */
+/** Atomically write the audit ledger. Thin typed wrapper over writeJsonAtomic. */
 function writeLedgerAtomic(targetPath: string, checks: AuditCheck[]): void {
-  const tmpPath = `${targetPath}.tmp`;
-  fs.writeFileSync(tmpPath, JSON.stringify(checks, null, 2), 'utf8');
-  fs.renameSync(tmpPath, targetPath);
+  writeJsonAtomic(targetPath, checks);
 }
 
 /**
@@ -495,19 +491,6 @@ function appendAuditChecksToLedger(
   return { ok: true, added: additions.length };
 }
 
-/**
- * Single async mutex shared by audit tools — guarantees a read-modify-write
- * cycle on the ledger is atomic across concurrent tool calls (e.g. future subagents).
- */
-function makeMutex() {
-  let chain: Promise<unknown> = Promise.resolve();
-  return async function run<T>(fn: () => Promise<T> | T): Promise<T> {
-    const next = chain.then(() => fn());
-    chain = next.catch(() => undefined);
-    return next;
-  };
-}
-
 // ---------------------------------------------------------------------------
 // Server factory
 // ---------------------------------------------------------------------------
diff --git a/src/utils/atomic-ledger.ts b/src/utils/atomic-ledger.ts
new file mode 100644
index 00000000..0ae8c832
--- /dev/null
+++ b/src/utils/atomic-ledger.ts
@@ -0,0 +1,29 @@
+/**
+ * Small shared primitives for on-disk ledgers: an atomic JSON writer and a
+ * single-chain async mutex. Used by the audit tools and by the orchestrator
+ * queue. Lifted here so both share one implementation.
+ */
+import * as fs from 'fs';
+
+/**
+ * Atomically write JSON: write to a `.tmp` file then rename over the target. The
+ * rename bumps the file's mtime in one step, which is what a file watcher polls.
+ */
+export function writeJsonAtomic(targetPath: string, data: unknown): void {
+  const tmpPath = `${targetPath}.tmp`;
+  fs.writeFileSync(tmpPath, JSON.stringify(data, null, 2), 'utf8');
+  fs.renameSync(tmpPath, targetPath);
+}
+
+/**
+ * A single async mutex. Serializes read-modify-write cycles so concurrent callers
+ * (parallel task agents, audit tool calls) never interleave a mutation.
+ */
+export function makeMutex() {
+  let chain: Promise<unknown> = Promise.resolve();
+  return async function run<T>(fn: () => Promise<T> | T): Promise<T> {
+    const next = chain.then(() => fn());
+    chain = next.catch(() => undefined);
+    return next;
+  };
+}

From b20d5b911eabe35ec0996e7a7e54d25c391e32e1 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:56:29 -0400
Subject: [PATCH 03/12] feat(orchestrator): enqueue_task, complete_task,
 read_handoffs tools with guards (#608)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../__tests__/queue-tools.test.ts             | 127 +++++++++
 src/lib/programs/orchestrator/queue-tools.ts  | 259 ++++++++++++++++++
 src/lib/wizard-tools.ts                       |  21 ++
 3 files changed, 407 insertions(+)
 create mode 100644 src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
 create mode 100644 src/lib/programs/orchestrator/queue-tools.ts

diff --git a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
new file mode 100644
index 00000000..318825d2
--- /dev/null
+++ b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
@@ -0,0 +1,127 @@
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { QueueStore } from '@lib/programs/orchestrator/queue';
+import {
+  applyComplete,
+  applyEnqueue,
+  applyReadHandoffs,
+  checkEnqueueGuards,
+  type OrchestratorToolsContext,
+} from '@lib/programs/orchestrator/queue-tools';
+
+function tmpDir(): string {
+  return fs.mkdtempSync(path.join(os.tmpdir(), 'queue-tools-test-'));
+}
+
+const VALID = ['install', 'init', 'capture'];
+
+describe('checkEnqueueGuards', () => {
+  let dir: string;
+  let store: QueueStore;
+  let ctx: OrchestratorToolsContext;
+
+  beforeEach(() => {
+    dir = tmpDir();
+    store = new QueueStore(dir, 'run-1');
+    ctx = { store, validTypes: VALID };
+  });
+
+  afterEach(() => fs.rmSync(dir, { recursive: true, force: true }));
+
+  it('rejects an unknown type', () => {
+    const r = checkEnqueueGuards(ctx, { type: 'nope', reason: 'x' });
+    expect(r).toMatchObject({ ok: false, guard: 'unknown-type' });
+  });
+
+  it('rejects an unknown dependency', () => {
+    const r = checkEnqueueGuards(ctx, {
+      type: 'init',
+      dependsOn: ['ghost'],
+      reason: 'x',
+    });
+    expect(r).toMatchObject({ ok: false, guard: 'unknown-dep' });
+  });
+
+  it('trips dedup on the same type and inputs', () => {
+    store.enqueue({ type: 'install', inputs: { pkg: 'posthog-js' } });
+    const r = checkEnqueueGuards(ctx, {
+      type: 'install',
+      inputs: { pkg: 'posthog-js' },
+      reason: 'x',
+    });
+    expect(r).toMatchObject({ ok: false, guard: 'dedup' });
+  });
+
+  it('allows a valid enqueue', () => {
+    const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' });
+    expect(r).toEqual({ ok: true });
+  });
+});
+
+describe('apply functions', () => {
+  let dir: string;
+  let store: QueueStore;
+  let ctx: OrchestratorToolsContext;
+
+  beforeEach(() => {
+    dir = tmpDir();
+    store = new QueueStore(dir, 'run-1');
+    ctx = { store, validTypes: VALID };
+  });
+
+  afterEach(() => fs.rmSync(dir, { recursive: true, force: true }));
+
+  it('attributes a seed enqueue to the orchestrator', () => {
+    const r = applyEnqueue(ctx, { type: 'install', reason: 'seed' });
+    expect(r.ok).toBe(true);
+    if (!r.ok) return;
+    expect(r.task.enqueuedBy).toBe('orchestrator');
+  });
+
+  it('attributes a follow-up enqueue to the running task', () => {
+    const parent = store.enqueue({ type: 'init' });
+    ctx.currentTaskId = parent.id;
+    const r = applyEnqueue(ctx, { type: 'capture', reason: 'follow-up' });
+    expect(r.ok).toBe(true);
+    if (!r.ok) return;
+    expect(r.task.enqueuedBy).toBe(parent.id);
+  });
+
+  it('complete_task fails when no task is running', () => {
+    const r = applyComplete(ctx, {
+      status: 'done',
+      handoff: { goals: 'g', did: 'd', forNextAgent: 'n' },
+    });
+    expect(r.ok).toBe(false);
+  });
+
+  it('complete_task marks the running task done and stores the handoff', () => {
+    const t = store.enqueue({ type: 'install' });
+    ctx.currentTaskId = t.id;
+    store.start(t.id);
+    const r = applyComplete(ctx, {
+      status: 'done',
+      handoff: { goals: 'g', did: 'added sdk', forNextAgent: 'env next' },
+    });
+    expect(r.ok).toBe(true);
+    expect(store.get(t.id)?.status).toBe('done');
+    expect(store.readHandoff(t.id)?.did).toBe('added sdk');
+  });
+
+  it('read_handoffs returns a dependency handoff for the running task', () => {
+    const dep = store.enqueue({ type: 'install' });
+    store.start(dep.id);
+    store.complete(dep.id, {
+      goals: 'g',
+      did: 'installed',
+      forNextAgent: 'now init',
+    });
+    const t = store.enqueue({ type: 'init', dependsOn: [dep.id] });
+    ctx.currentTaskId = t.id;
+
+    const handoffs = applyReadHandoffs(ctx, {});
+    expect(handoffs).toHaveLength(1);
+    expect(handoffs[0].did).toBe('installed');
+  });
+});
diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts
new file mode 100644
index 00000000..6d3aad52
--- /dev/null
+++ b/src/lib/programs/orchestrator/queue-tools.ts
@@ -0,0 +1,259 @@
+/**
+ * Orchestrator MCP tools, registered into the existing `wizard-tools` server when
+ * a queue is present. They let the orchestrator agent and task agents grow the
+ * queue, report completion with a structured handoff, and read prior handoffs.
+ *
+ * The guard logic and the apply functions are plain, exported, and unit-tested.
+ * `buildOrchestratorTools` wraps them in the SDK `tool()` shape.
+ */
+import { z } from 'zod';
+import { analytics } from '../../../utils/analytics';
+import {
+  TaskStatus,
+  type QueueStore,
+  type QueuedTask,
+  type TaskHandoff,
+} from './queue';
+
+export interface OrchestratorToolsContext {
+  store: QueueStore; // the queue these tools read and mutate
+  /** Task types the registry knows about. enqueue_task rejects anything else. */
+  validTypes: readonly string[];
+  /**
+   * The id of the task this tool server is bound to. Each task agent gets its
+   * own wizard-tools server, so attribution holds when independent tasks run
+   * in parallel. Absent for the seed, which is not a task.
+   */
+  currentTaskId?: string;
+}
+
+export interface EnqueueArgs {
+  type: string; // must be one of ctx.validTypes
+  inputs?: Record<string, unknown>; // defaults to {}; part of the dedup key
+  dependsOn?: string[]; // ids that must already exist in the store
+  model?: string; // forwarded to store.enqueue as-is
+  reason: string; // one line on why the task is needed
+}
+
+export type GuardResult =
+  | { ok: true }
+  | { ok: false; guard: string; message: string }; // guard = failed check id
+
+function stableStringify(value: unknown): string {
+  if (value === null) return 'null'; // below, ?? covers stringify → undefined
+  if (typeof value !== 'object') return JSON.stringify(value) ?? 'null';
+  if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]`;
+  const parts = Object.entries(value as Record<string, unknown>)
+    .filter(([, v]) => v !== undefined) // omitted, exactly as JSON would
+    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)) // locale-independent
+    .map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`);
+  return `{${parts.join(',')}}`;
+}
+
+function dedupKey(type: string, inputs: Record<string, unknown>): string {
+  return [type, stableStringify(inputs)].join('::'); // type + canonical inputs
+}
+
+/**
+ * Validate an enqueue. Structural checks only, in order: a registered type,
+ * known dependency ids, and no non-failed task with the same type and inputs.
+ * The first failure is returned; how much runs is left to the task graph.
+ */
+export function checkEnqueueGuards(
+  ctx: OrchestratorToolsContext,
+  args: EnqueueArgs,
+): GuardResult {
+  const existing = ctx.store.list();
+  const reject = (guard: string, message: string): GuardResult => ({
+    ok: false,
+    guard,
+    message,
+  });
+
+  if (!ctx.validTypes.includes(args.type)) {
+    const known = ctx.validTypes.join(', ');
+    return reject(
+      'unknown-type',
+      `Unknown task type "${args.type}". Valid types: ${known}.`,
+    );
+  }
+
+  const missingDep = (args.dependsOn ?? []).find((dep) => !ctx.store.get(dep));
+  if (missingDep !== undefined) {
+    return reject(
+      'unknown-dep',
+      `Dependency "${missingDep}" is not a known task id.`,
+    );
+  }
+
+  // A failed task does not block a fresh attempt with the same inputs.
+  const wanted = dedupKey(args.type, args.inputs ?? {});
+  const duplicate = existing.some(
+    (t) =>
+      t.status !== TaskStatus.Failed && dedupKey(t.type, t.inputs) === wanted,
+  );
+  if (duplicate) {
+    return reject(
+      'dedup',
+      `A "${args.type}" task with these inputs already exists.`,
+    );
+  }
+
+  return { ok: true };
+}
+
+export type EnqueueResult =
+  | { ok: true; task: QueuedTask }
+  | { ok: false; guard: string; message: string };
+
+/** Run the guards; on success enqueue the task, attributed to its enqueuer. */
+export function applyEnqueue(
+  ctx: OrchestratorToolsContext,
+  args: EnqueueArgs,
+): EnqueueResult {
+  const verdict = checkEnqueueGuards(ctx, args);
+  if (!verdict.ok) return verdict;
+
+  // Outside any task (the seed), the orchestrator itself is the enqueuer.
+  const enqueuedBy = ctx.currentTaskId ?? 'orchestrator';
+  const inputs = args.inputs ?? {};
+  const dependsOn = args.dependsOn ?? [];
+  const spec = { type: args.type, inputs, dependsOn, model: args.model };
+  const task = ctx.store.enqueue({ ...spec, enqueuedBy });
+  return { ok: true, task };
+}
+
+export type CompleteResult = { ok: true } | { ok: false; message: string };
+
+/**
+ * Record the bound task's self-reported outcome; fails if no task is bound.
+ */
+export function applyComplete(
+  ctx: OrchestratorToolsContext,
+  args: { status: 'done' | 'failed' | 'skipped'; handoff: TaskHandoff },
+): CompleteResult {
+  const unbound = 'complete_task can only be called by a running task agent.';
+  const taskId = ctx.currentTaskId;
+  if (!taskId) return { ok: false, message: unbound };
+  if (args.status === TaskStatus.Skipped) {
+    ctx.store.skip(taskId, args.handoff);
+  } else if (args.status === TaskStatus.Failed) {
+    // The note left for the next agent doubles as the failure message.
+    ctx.store.fail(
+      taskId,
+      { type: 'self-reported', message: args.handoff.forNextAgent },
+      args.handoff,
+    );
+  } else {
+    ctx.store.complete(taskId, args.handoff);
+  }
+  return { ok: true };
+}
+
+/** Handoffs by task id, else by type, else those of the current task's deps. */
+export function applyReadHandoffs(
+  ctx: OrchestratorToolsContext,
+  args: { type?: string; taskId?: string },
+): TaskHandoff[] {
+  const { store, currentTaskId } = ctx;
+  if (args.taskId) {
+    const single = store.readHandoff(args.taskId);
+    return single ? [single] : [];
+  }
+  if (args.type) return store.readHandoffsByType(args.type);
+
+  // Unfiltered: one handoff per dependency of the current task that has one.
+  const self = currentTaskId ? store.get(currentTaskId) : undefined;
+  if (!self) return [];
+  return self.dependsOn
+    .map((depId) => store.readHandoff(depId))
+    .filter((h): h is TaskHandoff => h !== null);
+}
+
+const HANDOFF_SHAPE = {
+  goals: z.string().describe('What this task was asked to achieve.'),
+  did: z.string().describe('What you actually did.'),
+  forNextAgent: z.string().describe('What the next agent should know.'),
+  filesTouched: z.array(z.string()).optional(), // the only optional field
+};
+
+type SdkTool = (
+  name: string,
+  description: string,
+  // The SDK accepts a plain object of zod fields as the schema.
+  schema: Record<string, z.ZodTypeAny>,
+  handler: (args: never) => unknown, // typed handlers are cast to this shape
+) => unknown;
+
+function textResult(text: string, isError = false) {
+  return { isError, content: [{ type: 'text' as const, text }] };
+}
+
+/**
+ * Wrap the apply functions in the SDK `tool()` shape and return the three tools
+ * in registration order: enqueue_task, complete_task, read_handoffs. Called from
+ * createWizardToolsServer only when a queue context is present.
+ */
+export function buildOrchestratorTools(
+  tool: SdkTool,
+  ctx: OrchestratorToolsContext,
+): unknown[] {
+  type Completion = Parameters<typeof applyComplete>[1];
+  type HandoffQuery = Parameters<typeof applyReadHandoffs>[1];
+  // Typed handlers are narrowed to the SDK-facing handler signature.
+  const asHandler = <A>(fn: (args: A) => unknown) =>
+    fn as (args: never) => unknown;
+
+  const onEnqueue = asHandler((args: EnqueueArgs) => {
+    const outcome = applyEnqueue(ctx, args);
+    if (outcome.ok) return textResult(JSON.stringify({ id: outcome.task.id }));
+    // A rejected enqueue is reported to analytics and surfaced as a tool error.
+    const { guard, message } = outcome;
+    analytics.wizardCapture('orchestrator guard tripped', {
+      guard,
+      type: args.type,
+    });
+    return textResult(message, true);
+  });
+  const onComplete = asHandler((args: Completion) => {
+    const outcome = applyComplete(ctx, args);
+    return outcome.ok ? textResult('ok') : textResult(outcome.message, true);
+  });
+  const onRead = asHandler((args: HandoffQuery) =>
+    textResult(JSON.stringify(applyReadHandoffs(ctx, args), null, 2)),
+  );
+
+  const typeHint = `The task type. One of: ${ctx.validTypes.join(', ')}.`;
+  return [
+    tool(
+      'enqueue_task',
+      'Add a task to the orchestrator queue. Use it to seed work and to enqueue follow-up work you discover. Keep tasks small and discrete.',
+      {
+        type: z.string().describe(typeHint),
+        inputs: z.record(z.unknown()).optional(),
+        dependsOn: z
+          .array(z.string())
+          .optional()
+          .describe('Task ids that must be done before this task runs.'),
+        model: z.string().optional(),
+        reason: z.string().describe('One line on why this task is needed.'),
+      },
+      onEnqueue,
+    ),
+    tool(
+      'complete_task',
+      "Report the outcome of your task. Always call this exactly once when you finish, with a structured handoff for the next agent. Use status 'skipped' when the task does not apply to this project and you cannot do it (say why in the handoff) — not 'done'.",
+      {
+        status: z.enum(['done', 'failed', 'skipped']),
+        handoff: z.object(HANDOFF_SHAPE),
+      },
+      onComplete,
+    ),
+    tool(
+      'read_handoffs',
+      'Read structured handoffs from earlier tasks. With no argument, returns the handoffs of your dependencies.',
+      { type: z.string().optional(), taskId: z.string().optional() },
+      onRead,
+    ),
+  ];
+}
diff --git a/src/lib/wizard-tools.ts b/src/lib/wizard-tools.ts
index 8d2f8d37..0f7e6cc5 100644
--- a/src/lib/wizard-tools.ts
+++ b/src/lib/wizard-tools.ts
@@ -26,6 +26,10 @@ import {
 } from './programs/audit/types';
 import type { WizardAskBridge } from './wizard-ask-bridge';
 import { createSecretVault, type SecretVault } from './secret-vault';
+import {
+  buildOrchestratorTools,
+  type OrchestratorToolsContext,
+} from './programs/orchestrator/queue-tools';
 
 // ---------------------------------------------------------------------------
 // SDK dynamic import (ESM module loaded once, cached)
@@ -224,6 +228,14 @@ export interface WizardToolsOptions {
    * (e.g. in unit tests), a fresh vault is created internally.
    */
   secretVault?: SecretVault;
+
+  /**
+   * Orchestrator queue context. Present only when the `wizard-orchestrator`
+   * flag routes the run to the orchestrator; when set, the orchestrator tools
+   * (enqueue_task, complete_task, read_handoffs) are registered. Absent on the
+   * linear path.
+   */
+  orchestrator?: OrchestratorToolsContext;
 }
 
 /** Default per-run cap on wizard_ask calls when no override is provided. */
@@ -509,6 +521,7 @@ export async function createWizardToolsServer(options: WizardToolsOptions) {
     askBridge,
     askMaxQuestions = DEFAULT_ASK_MAX_QUESTIONS,
     secretVault = createSecretVault(),
+    orchestrator,
   } = options;
   const sdk = await getSDKModule();
   const { tool, createSdkMcpServer } = sdk;
@@ -1108,6 +1121,10 @@ export async function createWizardToolsServer(options: WizardToolsOptions) {
 
   // -- Assemble server ------------------------------------------------------
 
+  const orchestratorTools = orchestrator
+    ? buildOrchestratorTools(tool, orchestrator)
+    : [];
+
   return createSdkMcpServer({
     name: SERVER_NAME,
     version: '1.0.0',
@@ -1121,6 +1138,7 @@ export async function createWizardToolsServer(options: WizardToolsOptions) {
       auditAddChecks,
       auditResolveChecks,
       wizardAsk,
+      ...orchestratorTools,
     ],
   });
 }
@@ -1140,6 +1158,9 @@ export const WIZARD_TOOL_NAMES = {
   auditAddChecks: `mcp__${SERVER_NAME}__audit_add_checks`,
   auditResolveChecks: `mcp__${SERVER_NAME}__audit_resolve_checks`,
   wizardAsk: `mcp__${SERVER_NAME}__wizard_ask`,
+  enqueueTask: `mcp__${SERVER_NAME}__enqueue_task`,
+  completeTask: `mcp__${SERVER_NAME}__complete_task`,
+  readHandoffs: `mcp__${SERVER_NAME}__read_handoffs`,
 } as const;
 
 // ---------------------------------------------------------------------------

From 56794568597f5669bce898b841819ebb912b2e20 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:57:23 -0400
Subject: [PATCH 04/12] feat(orchestrator): executor drain-loop scheduler
 (#609)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../orchestrator/__tests__/executor.test.ts   | 150 ++++++++++++++++++
 src/lib/programs/orchestrator/executor.ts     | 115 ++++++++++++++
 2 files changed, 265 insertions(+)
 create mode 100644 src/lib/programs/orchestrator/__tests__/executor.test.ts
 create mode 100644 src/lib/programs/orchestrator/executor.ts

diff --git a/src/lib/programs/orchestrator/__tests__/executor.test.ts b/src/lib/programs/orchestrator/__tests__/executor.test.ts
new file mode 100644
index 00000000..5665b9b2
--- /dev/null
+++ b/src/lib/programs/orchestrator/__tests__/executor.test.ts
@@ -0,0 +1,150 @@
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  QueueStore,
+  type QueuedTask,
+  type TaskHandoff,
+} from '@lib/programs/orchestrator/queue';
+import { drainQueue, type RunTask } from '@lib/programs/orchestrator/executor';
+
+jest.mock('@utils/analytics', () => ({
+  analytics: { captureException: jest.fn(), wizardCapture: jest.fn() },
+}));
+import { analytics } from '@utils/analytics';
+
+const HANDOFF: TaskHandoff = { goals: 'g', did: 'd', forNextAgent: 'n' };
+
+function tmpDir(): string {
+  return fs.mkdtempSync(path.join(os.tmpdir(), 'executor-test-'));
+}
+
+describe('drainQueue', () => {
+  let dir: string;
+  let q: QueueStore;
+
+  beforeEach(() => {
+    dir = tmpDir();
+    q = new QueueStore(dir, 'run-1');
+  });
+
+  afterEach(() => fs.rmSync(dir, { recursive: true, force: true }));
+
+  const completing: RunTask = (task) => {
+    q.complete(task.id, HANDOFF);
+    return Promise.resolve();
+  };
+
+  it('runs a single task to done and drains', async () => {
+    const a = q.enqueue({ type: 'install' });
+    await drainQueue(q, completing, { maxStarts: 50 });
+    expect(q.get(a.id)?.status).toBe('done');
+    expect(q.isDrained()).toBe(true);
+  });
+
+  it('runs a dependent task only after its dependency completes', async () => {
+    const order: string[] = [];
+    const a = q.enqueue({ type: 'install' });
+    const b = q.enqueue({ type: 'init', dependsOn: [a.id] });
+    const runner: RunTask = (task) => {
+      order.push(task.type);
+      q.complete(task.id, HANDOFF);
+      return Promise.resolve();
+    };
+    await drainQueue(q, runner, { maxStarts: 50 });
+    expect(order).toEqual(['install', 'init']);
+    expect(q.get(b.id)?.status).toBe('done');
+  });
+
+  it('runs independent branches concurrently; the graph is the only schedule', async () => {
+    let active = 0;
+    let maxActive = 0;
+    const runner: RunTask = async (task) => {
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      await new Promise((r) => setTimeout(r, 5));
+      q.complete(task.id, HANDOFF);
+      active -= 1;
+    };
+    const a = q.enqueue({ type: 'install' });
+    const b = q.enqueue({ type: 'init' });
+    q.enqueue({ type: 'capture', dependsOn: [a.id, b.id] });
+    await drainQueue(q, runner, { maxStarts: 50 });
+    // install and init overlap; capture waits for both.
+    expect(maxActive).toBe(2);
+    expect(q.summary().done).toBe(3);
+  });
+
+  it('starts a dependent the moment its dependency finishes, not in waves', async () => {
+    const slow = q.enqueue({ type: 'slow' });
+    const fast = q.enqueue({ type: 'fast' });
+    q.enqueue({ type: 'after-fast', dependsOn: [fast.id] });
+    let slowAtStart: string | undefined;
+    const runner: RunTask = async (task) => {
+      if (task.type === 'after-fast') slowAtStart = q.get(slow.id)?.status;
+      // slow holds the wave open; fast finishes early and unblocks after-fast.
+      const delay = task.type === 'slow' ? 30 : 5;
+      await new Promise((r) => setTimeout(r, delay));
+      q.complete(task.id, HANDOFF);
+    };
+    await drainQueue(q, runner, { maxStarts: 50 });
+    // after-fast started while slow was still running, i.e. before it was done.
+    expect(slowAtStart).toBeDefined();
+    expect(slowAtStart).not.toBe('done');
+    expect(q.summary().done).toBe(3);
+  });
+
+  it('retries a task that ends without reporting, then fails it', async () => {
+    const a = q.enqueue({ type: 'install', maxAttempts: 2 });
+    const noReport: RunTask = async () => {
+      /* agent never calls complete_task */
+    };
+    await drainQueue(q, noReport, { maxStarts: 50 });
+    expect(q.get(a.id)?.status).toBe('failed');
+    expect(q.get(a.id)?.attempts).toBe(2);
+  });
+
+  it('succeeds on a retry within the attempt budget', async () => {
+    let calls = 0;
+    const a = q.enqueue({ type: 'install', maxAttempts: 3 });
+    const flaky: RunTask = (task: QueuedTask) => {
+      calls += 1;
+      if (calls >= 2) q.complete(task.id, HANDOFF);
+      return Promise.resolve();
+    };
+    await drainQueue(q, flaky, { maxStarts: 50 });
+    expect(q.get(a.id)?.status).toBe('done');
+    expect(calls).toBe(2);
+  });
+
+  it('captures and fails a task whose runner throws', async () => {
+    const a = q.enqueue({ type: 'install', maxAttempts: 1 });
+    const throwing: RunTask = () => Promise.reject(new Error('agent exploded'));
+    await drainQueue(q, throwing, { maxStarts: 50 });
+    expect(q.get(a.id)?.status).toBe('failed');
+    expect(analytics.captureException).toHaveBeenCalled();
+  });
+
+  it('does not run a task whose dependency failed', async () => {
+    const a = q.enqueue({ type: 'install', maxAttempts: 1 });
+    const b = q.enqueue({ type: 'init', dependsOn: [a.id] });
+    const runner: RunTask = (task) => {
+      if (task.type === 'init') q.complete(task.id, HANDOFF);
+      // install never reports, so it fails after its single attempt.
+      return Promise.resolve();
+    };
+    await drainQueue(q, runner, { maxStarts: 50 });
+    expect(q.get(a.id)?.status).toBe('failed');
+    expect(q.get(b.id)?.status).toBe('pending'); // b was never started
+    expect(q.isDrained()).toBe(true); // blocked tasks still count as drained
+  });
+
+  it('terminates via the start backstop instead of looping forever', async () => {
+    const a = q.enqueue({ type: 'install', maxAttempts: 999 });
+    const neverReports: RunTask = async () => {
+      /* would retry forever without the backstop */
+    };
+    await drainQueue(q, neverReports, { maxStarts: 3 });
+    expect(q.get(a.id)?.attempts).toBeLessThanOrEqual(3);
+  });
+});
diff --git a/src/lib/programs/orchestrator/executor.ts b/src/lib/programs/orchestrator/executor.ts
new file mode 100644
index 00000000..abf0ed15
--- /dev/null
+++ b/src/lib/programs/orchestrator/executor.ts
@@ -0,0 +1,115 @@
+/**
+ * The executor drains the queue. It starts every runnable task (dependencies
+ * satisfied) as soon as it becomes runnable — parallelism is decided by the
+ * task graph, not by an executor knob. Each task runs through an injected
+ * `runTask` function and reports its outcome via `complete_task`; a task that
+ * ends without reporting is retried while attempts remain, then failed. A
+ * `maxStarts` backstop guarantees termination.
+ *
+ * The drain loop is independent of how a task actually runs. `runTask` is
+ * injected: the real one spins up a fresh agent, the tests use a fake.
+ */
+import { analytics } from '../../../utils/analytics';
+import { logToFile } from '../../../utils/debug';
+import { TaskStatus, type QueueStore, type QueuedTask } from './queue';
+
+/** Per-task agent configuration the resolver produces from a task's type. */
+export interface ResolvedTask {
+  model: string;
+  allowedTools: readonly string[];
+  disallowedTools: readonly string[];
+  /** Mini-skills to install before the task runs (the HOW). */
+  skills: readonly string[];
+  prompt: string;
+}
+
+/** Resolves a queued task to what the agent needs. The real one is markdown-backed. */
+export type TaskResolver = (
+  task: QueuedTask,
+  store: QueueStore,
+) => ResolvedTask;
+
+/** Runs one task's agent, which should call complete_task before it settles.
+ *  A rejection, or settling without a report, is handled by the executor. */
+export type RunTask = (task: QueuedTask) => Promise<void>;
+
+export interface DrainOptions {
+  /** Cap on task starts per drain, retries included; guarantees termination. */
+  maxStarts: number;
+}
+
+export const DEFAULT_DRAIN_OPTIONS: DrainOptions = {
+  maxStarts: 200,
+};
+
+/** Run one task, then settle its queue status: retry, fail, or leave it. */
+async function runOne(
+  store: QueueStore,
+  runTask: RunTask,
+  task: QueuedTask,
+): Promise<void> {
+  store.start(task.id);
+  try {
+    await runTask(task);
+  } catch (error) {
+    // A throw is not a report. Surface it loudly here; the status check
+    // below decides what happens to the queue entry.
+    logToFile(`[executor] runTask threw for ${task.type}:`, error);
+    const cause = error instanceof Error ? error : new Error(String(error));
+    analytics.captureException(cause, {
+      step: 'orchestrator_run_task',
+      task_type: task.type,
+    });
+  }
+
+  const settled = store.get(task.id);
+  if (!settled) return;
+  const canRetry = settled.attempts < settled.maxAttempts;
+  switch (settled.status) {
+    case TaskStatus.Running:
+      // The agent ended without calling complete_task: retry, else fail.
+      if (canRetry) {
+        store.requeue(task.id);
+      } else {
+        store.fail(task.id, {
+          type: 'no-report',
+          message: 'Task ended without calling complete_task.',
+        });
+      }
+      break;
+    case TaskStatus.Failed:
+      // A reported failure gets another go while attempts remain.
+      if (canRetry) store.requeue(task.id);
+      break;
+  }
+}
+
+/**
+ * Drain the queue to a terminal state. Every runnable task starts the moment
+ * its dependencies finish; independent branches run concurrently. Returns when
+ * every task is done, failed, or blocked by a failed dependency, or when the
+ * `maxStarts` backstop trips (logged; tasks already in flight still finish).
+ */
+export async function drainQueue(
+  store: QueueStore,
+  runTask: RunTask,
+  opts: DrainOptions = DEFAULT_DRAIN_OPTIONS,
+): Promise<void> {
+  const running = new Map<string, Promise<void>>();
+  let starts = 0;
+  for (;;) {
+    for (const task of store.nextRunnable()) {
+      if (++starts > opts.maxStarts) {
+        // Never stop silently: runnable work is being left unstarted.
+        logToFile(`[executor] maxStarts (${opts.maxStarts}) reached`);
+        break;
+      }
+      // runOne starts it synchronously, so nextRunnable() won't offer it again.
+      const p = runOne(store, runTask, task);
+      running.set(task.id, p.finally(() => running.delete(task.id)));
+    }
+    if (running.size === 0) break;
+    // Wake on the first finish; it may have unblocked dependents or requeued.
+    await Promise.race(running.values());
+  }
+}

From 6c2318d7ff80e8f19164cbf93e8f1fcda062a481 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:18:49 -0400
Subject: [PATCH 05/12] feat(orchestrator): markdown-backed agent loader + full
 integration flow (#619)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/lib/agent/agent-interface.ts              |  31 +-
 src/lib/agent/agent-runner.ts                 |  17 +-
 .../__tests__/agent-prompt-loader.test.ts     | 205 ++++++++++++
 .../orchestrator/agent-prompt-loader.ts       | 310 ++++++++++++++++++
 .../orchestrator/orchestrator-runner.ts       | 296 +++++++++++++++++
 src/lib/programs/orchestrator/queue-tools.ts  |  14 +
 src/lib/programs/orchestrator/queue.ts        |   6 +
 src/lib/task-stream/task-stream-push.ts       |   2 +
 src/ui/logging-ui.ts                          |  18 +-
 src/ui/tui/primitives/ProgressList.tsx        |  28 +-
 src/ui/wizard-ui.ts                           |   1 +
 11 files changed, 902 insertions(+), 26 deletions(-)
 create mode 100644 src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
 create mode 100644 src/lib/programs/orchestrator/agent-prompt-loader.ts
 create mode 100644 src/lib/programs/orchestrator/orchestrator-runner.ts

diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts
index 46f375a2..07342385 100644
--- a/src/lib/agent/agent-interface.ts
+++ b/src/lib/agent/agent-interface.ts
@@ -147,6 +147,12 @@ export type AgentConfig = {
   getPendingQuestion?: () =>
     | import('@lib/wizard-session').PendingQuestion
     | null;
+  /**
+   * Orchestrator queue context. Present only when the `wizard-orchestrator`
+   * flag routes the run here; threaded into wizard-tools so the orchestrator
+   * tools register.
+   */
+  orchestrator?: import('@lib/programs/orchestrator/queue-tools').OrchestratorToolsContext;
 };
 
 /**
@@ -168,6 +174,7 @@ export type StopHookResult =
 export function createStopHook(
   featureQueue: readonly AdditionalFeature[],
   signals?: AgentOutputSignals,
+  requestRemark = true,
 ): (input: { stop_hook_active: boolean }) => StopHookResult {
   let featureIndex = 0;
   let remarkRequested = false;
@@ -195,8 +202,9 @@ export function createStopHook(
       return { decision: 'block', reason: prompt };
     }
 
-    // Phase 2: collect remark (once)
-    if (!remarkRequested) {
+    // Phase 2: collect remark (once). Skipped when the caller opts out — the
+    // orchestrator suppresses it per task so it does not fire on every agent.
+    if (requestRemark && !remarkRequested) {
       remarkRequested = true;
       logToFile('Stop hook: requesting reflection');
       return {
@@ -537,8 +545,6 @@ export async function initializeAgent(
   logToFile('Agent initialization starting');
   logToFile('Install directory:', options.installDir);
 
-  getUI().log.step('Initializing Claude agent...');
-
   try {
     // Configure LLM gateway environment variables (inherited by SDK subprocess)
     const gatewayUrl = getLlmGatewayUrlFromHost(config.posthogApiHost);
@@ -590,6 +596,7 @@ export async function initializeAgent(
       skillsBaseUrl: config.skillsBaseUrl,
       askBridge: config.askBridge,
       askMaxQuestions: config.askMaxQuestions,
+      orchestrator: config.orchestrator,
     });
     mcpServers['wizard-tools'] = wizardToolsServer;
 
@@ -624,8 +631,6 @@ export async function initializeAgent(
       });
     }
 
-    getUI().log.step(`Verbose logs: ${getLogFilePath()}`);
-    getUI().log.success("Agent initialized. Let's get cooking!");
     return agentRunConfig;
   } catch (error) {
     getUI().log.error(
@@ -671,6 +676,8 @@ export async function runAgent(
     errorMessage?: string;
     additionalFeatureQueue?: readonly AdditionalFeature[];
     abortCases?: readonly AbortCaseMatcher[];
+    /** Request the end-of-run reflection remark. Defaults to true. */
+    requestRemark?: boolean;
   },
   middleware?: {
     onMessage(message: any): void;
@@ -930,7 +937,11 @@ export async function runAgent(
           Stop: [
             {
               hooks: [
-                createStopHook(config?.additionalFeatureQueue ?? [], signals),
+                createStopHook(
+                  config?.additionalFeatureQueue ?? [],
+                  signals,
+                  config?.requestRemark ?? true,
+                ),
               ],
               timeout: 30,
             },
@@ -978,6 +989,7 @@ export async function runAgent(
         signals,
         receivedSuccessResult,
         tasks,
+        isOrchestratorEnabled(agentConfig.wizardFlags ?? {}),
       );
 
       // [ABORT] detection: the skill emits "[ABORT] <reason>" when it
@@ -1327,6 +1339,9 @@ function handleSDKMessage(
   signals: AgentOutputSignals,
   receivedSuccessResult = false,
   tasks?: Map<string, TaskEntry>,
+  // The orchestrator owns the TUI task panel (it renders its queue). Suppress the
+  // agent's own TaskCreate/TaskUpdate rendering so it does not clobber the queue.
+  suppressTaskRender = false,
 ): void {
   // Map preserves insertion order (the order the agent created the tasks).
   // Within that, group by status: completed first, then in_progress, then
@@ -1338,7 +1353,7 @@ function handleSDKMessage(
   };
   const rank = (status: string): number => STATUS_RANK[status] ?? 2;
   const syncTasks = (): void => {
-    if (!tasks) return;
+    if (!tasks || suppressTaskRender) return;
     const sorted = Array.from(tasks.values()).sort(
       (a, b) => rank(a.status) - rank(b.status),
     );
diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index 77c2f9ce..07a89c0c 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -31,12 +31,14 @@ import {
   AgentErrorType,
   AgentSignals,
   buildWizardMetadata,
+  isOrchestratorEnabled,
 } from './agent-interface';
 import {
   checkAllSettingsConflicts,
   backupAndFixClaudeSettings,
   restoreClaudeSettings,
 } from './claude-settings';
+import { runOrchestrator } from '../programs/orchestrator/orchestrator-runner';
 import { getCloudUrlFromRegion } from '@utils/urls';
 import {
   evaluateWizardReadiness,
@@ -45,7 +47,12 @@ import {
   getBlockingServiceKeys,
   SERVICE_LABELS,
 } from '@lib/health-checks/readiness';
-import { enableDebugLogs, initLogFile, logToFile } from '@utils/debug';
+import {
+  enableDebugLogs,
+  getLogFilePath,
+  initLogFile,
+  logToFile,
+} from '@utils/debug';
 import { createBenchmarkPipeline } from '@lib/middleware/benchmark';
 import { wizardAbort, WizardError, registerCleanup } from '@utils/wizard-abort';
 import { formatScanReport, writeScanReport } from '@lib/yara-hooks';
@@ -209,6 +216,11 @@ export async function runProgram(
 ): Promise<void> {
   const boot = await bootstrapProgram(session, config, programConfig);
 
+  if (isOrchestratorEnabled(boot.wizardFlags)) {
+    getUI().log.info('Task-queue orchestrator enabled.');
+    return runOrchestrator(session, programConfig, boot);
+  }
+
   return runLinearProgram(session, config, programConfig, boot);
 }
 
@@ -445,6 +457,7 @@ async function runLinearProgram(
         timeoutMs: config.askTimeoutMs,
       });
 
+  getUI().log.step('Initializing Claude agent...');
   const agent = await initializeAgent(
     {
       workingDirectory: session.installDir,
@@ -466,6 +479,8 @@ async function runLinearProgram(
     },
     sessionToOptions(session),
   );
+  getUI().log.step(`Verbose logs: ${getLogFilePath()}`);
+  getUI().log.success("Agent initialized. Let's get cooking!");
 
   logToFile('[agent-runner] agent initialized');
 
diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
new file mode 100644
index 00000000..64a4bdab
--- /dev/null
+++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
@@ -0,0 +1,205 @@
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  agentRunTools,
+  buildRegistry,
+  parseAgentPrompt,
+  resolveTask,
+  type AgentPrompt,
+  type AgentRegistry,
+} from '../agent-prompt-loader';
+import { QueueStore } from '../queue';
+
+/** Creates a fresh, uniquely named scratch directory under the OS temp dir. */
+const tmpDir = (): string =>
+  fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-'));
+
+/** A registry for `test-flow` holding `prompts`, each forced into that flow. */
+function registryOf(prompts: AgentPrompt[]): AgentRegistry {
+  const flow = 'test-flow';
+  const scoped = prompts.map((prompt) => ({ ...prompt, flow }));
+  return buildRegistry(scoped, flow);
+}
+
+describe('parseAgentPrompt', () => {
+  const sample = `---
+type: instrument-events
+model: claude-sonnet-4-6     # cheapest model that succeeds
+skills: [instrument-events]
+allowedTools: [Read, Edit, Grep, Glob, Bash]
+disallowedTools: [enqueue_task]
+dependsOn: [init]
+---
+
+## Goal
+Add at least one capture call.
+`;
+
+  it('parses frontmatter scalars and inline arrays', () => {
+    const p = parseAgentPrompt(sample, 'fallback');
+    expect(p.type).toBe('instrument-events');
+    expect(p.model).toBe('claude-sonnet-4-6');
+    expect(p.skills).toEqual(['instrument-events']);
+    expect(p.allowedTools).toEqual(['Read', 'Edit', 'Grep', 'Glob', 'Bash']);
+    expect(p.disallowedTools).toEqual(['enqueue_task']);
+    expect(p.dependsOn).toEqual(['init']);
+  });
+
+  it('strips inline comments and keeps the body', () => {
+    const p = parseAgentPrompt(sample, 'fallback');
+    expect(p.model).not.toContain('#');
+    expect(p.body).toContain('## Goal');
+    expect(p.body).not.toContain('---');
+  });
+
+  it('falls back to the menu id when type is omitted', () => {
+    const p = parseAgentPrompt('---\nmodel: x\n---\nbody', 'install');
+    expect(p.type).toBe('install');
+  });
+
+  it('parses the flow from frontmatter', () => {
+    const p = parseAgentPrompt('---\nflow: audit\n---\nx', 'fix-events');
+    expect(p.flow).toBe('audit');
+  });
+
+  it('marks the seed from frontmatter; everything else is a task', () => {
+    expect(parseAgentPrompt('---\nseed: true\n---\nplan', 'planner').seed).toBe(
+      true,
+    );
+    expect(parseAgentPrompt('---\nmodel: x\n---\nbody', 'install').seed).toBe(
+      false,
+    );
+  });
+
+  it('defaults missing array fields to empty and model to undefined', () => {
+    const p = parseAgentPrompt('no frontmatter at all', 'stub');
+    expect(p.model).toBeUndefined();
+    expect(p.skills).toEqual([]);
+    expect(p.dependsOn).toEqual([]);
+    expect(p.body).toBe('no frontmatter at all');
+  });
+});
+
+describe('agentRunTools', () => {
+  it('MCP-qualifies orchestrator tools and passes native tools through', () => {
+    const p = parseAgentPrompt(
+      '---\nallowedTools: [Read, read_handoffs]\ndisallowedTools: [enqueue_task, complete_task, Bash]\n---\nx',
+      't',
+    );
+    const { allowedTools, disallowedTools } = agentRunTools(p);
+    expect(allowedTools).toEqual([
+      'Read',
+      'mcp__posthog-wizard__read_handoffs',
+    ]);
+    expect(disallowedTools).toEqual([
+      'mcp__posthog-wizard__enqueue_task',
+      'mcp__posthog-wizard__complete_task',
+      'Bash',
+    ]);
+  });
+});
+
+describe('buildRegistry', () => {
+  const prompt = (over: Partial<AgentPrompt>): AgentPrompt => ({
+    type: 'x',
+    seed: false,
+    skills: [],
+    allowedTools: [],
+    disallowedTools: [],
+    dependsOn: [],
+    body: 'b',
+    ...over,
+  });
+
+  it('scopes to one flow and keeps the seed out of the task types', () => {
+    const registry = buildRegistry(
+      [
+        prompt({ type: 'plan-audit', flow: 'audit', seed: true }),
+        prompt({ type: 'fix-events', flow: 'audit' }),
+        prompt({ type: 'install', flow: 'posthog-integration' }),
+        prompt({ type: 'example' }),
+      ],
+      'audit',
+    );
+    expect(registry.types).toEqual(['fix-events']);
+    expect(registry.seed?.type).toBe('plan-audit');
+    expect(registry.get('install')).toBeUndefined();
+    // A flowless prompt (e.g. the documentation example) joins no registry.
+    expect(registry.get('example')).toBeUndefined();
+  });
+});
+
+describe('resolveTask', () => {
+  let dir: string;
+  let store: QueueStore;
+
+  beforeEach(() => {
+    dir = tmpDir();
+    store = new QueueStore(dir, 'run-1');
+  });
+
+  afterEach(() => {
+    fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  const prompt: AgentPrompt = {
+    type: 'capture',
+    seed: false,
+    model: 'claude-haiku-4-5-20251001',
+    skills: ['instrument-events'],
+    allowedTools: ['Read', 'Edit'],
+    disallowedTools: ['enqueue_task'],
+    dependsOn: ['plan-capture'],
+    body: '## Goal\nInstrument the planned events.',
+  };
+
+  it('throws when no prompt is registered for the type', () => {
+    const registry = registryOf([]);
+    const task = { type: 'capture', dependsOn: [] } as never;
+    expect(() => resolveTask(registry, task, store)).toThrow(/capture/);
+  });
+
+  it('resolves model, tools, and skills from the prompt', () => {
+    const registry = registryOf([prompt]);
+    const task = store.enqueue({ type: 'capture' });
+    const resolved = resolveTask(registry, task, store);
+    expect(resolved.model).toBe('claude-haiku-4-5-20251001');
+    expect(resolved.skills).toEqual(['instrument-events']);
+    expect(resolved.disallowedTools).toEqual([
+      'mcp__posthog-wizard__enqueue_task',
+    ]);
+  });
+
+  it('prefers the enqueue model override over the prompt model', () => {
+    const registry = registryOf([prompt]);
+    const task = store.enqueue({ type: 'capture', model: 'override-x' });
+    expect(resolveTask(registry, task, store).model).toBe('override-x');
+  });
+
+  it("appends upstream dependencies' handoffs as context", () => {
+    const registry = registryOf([prompt]);
+    const dep = store.enqueue({ type: 'plan-capture' });
+    store.complete(dep.id, {
+      goals: 'decide events',
+      did: 'picked signup and purchase',
+      forNextAgent: 'instrument those two',
+    });
+    const task = store.enqueue({
+      type: 'capture',
+      dependsOn: [dep.id],
+    });
+    const resolved = resolveTask(registry, task, store);
+    expect(resolved.prompt).toContain('Context from previous steps');
+    expect(resolved.prompt).toContain('picked signup and purchase');
+    expect(resolved.prompt).toContain('instrument those two');
+  });
+
+  it('omits the context section when there are no handoffs', () => {
+    const registry = registryOf([prompt]);
+    const task = store.enqueue({ type: 'capture' });
+    expect(resolveTask(registry, task, store).prompt).not.toContain(
+      'Context from previous steps',
+    );
+  });
+});
diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts
new file mode 100644
index 00000000..ee351db8
--- /dev/null
+++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts
@@ -0,0 +1,310 @@
+/**
+ * Agent-prompt loader + registry.
+ *
+ * Agent prompts are the WHAT of a task: a markdown file per type, served from
+ * context-mill as the `agents` content type (parallel to skills). The frontmatter
+ * carries the artifacts the executor needs — model, the mini-skills to load (the
+ * HOW), the tools the task may use, and its dependencies — and the body is the
+ * instruction the agent reads.
+ *
+ * The registry is fetched once at startup and scoped to one flow — agents
+ * declare `flow` and (for the planner) `seed: true` in frontmatter, so each
+ * program (integration, audit, migration, ...) ships its own agent set and the
+ * loader stays generic. Every prompt is downloaded and parsed up front, so
+ * resolving a task to its run config is synchronous and adds no mid-drain
+ * network latency. The registry's type list also drives `enqueue_task`
+ * validation.
+ */
+import type { QueueStore, QueuedTask } from './queue';
+import type { ResolvedTask } from './executor';
+
+/**
+ * The basics the client injects around every agent-prompt body. The `/agents/`
+ * files carry intent only (goal, success criteria); the wizard owns the I/O
+ * contract — who the agent is, how it reports, how it surfaces progress — so the
+ * authored prompts never restate it.
+ */
+export interface OrchestratorPromptContext {
+  projectId: number;
+  projectApiKey: string;
+  host: string;
+  /** Path to the framework's reference implementation (EXAMPLE.md), if available. */
+  examplePath?: string;
+  /** Path to the framework's rules (COMMANDMENTS.md), if available. */
+  commandmentsPath?: string;
+}
+
+function projectContext(ctx: OrchestratorPromptContext): string {
+  return `You have access to the PostHog MCP server and the wizard tools.
+
+Project context:
+- PostHog Project ID: ${ctx.projectId}
+- PostHog public token: ${ctx.projectApiKey}
+- PostHog Host: ${ctx.host}`;
+}
+
+/** Points the agent at the framework's reference integration to learn patterns from. */
+function exampleReference(ctx: OrchestratorPromptContext): string | null {
+  if (!ctx.examplePath) return null;
+  return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`;
+}
+
+/** The framework's rules ship with the reference skill; every task follows them. */
+function commandmentsReference(ctx: OrchestratorPromptContext): string | null {
+  if (!ctx.commandmentsPath) return null;
+  return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`;
+}
+
+const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`;
+
+const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`;
+
+/**
+ * Compose a task agent's prompt: project context, the optional example and
+ * commandments pointers, the task basics, then the authored body. Sections
+ * that are null or empty are left out; the rest are separated by blank lines.
+ */
+export function assembleTaskPrompt(
+  ctx: OrchestratorPromptContext,
+  body: string,
+): string {
+  const sections: Array<string | null> = [projectContext(ctx)];
+  sections.push(exampleReference(ctx), commandmentsReference(ctx));
+  sections.push(TASK_BASICS, body);
+  const present = sections.filter((section) => Boolean(section));
+  return present.join('\n\n');
+}
+
+/** Seed prompt: project context, seed basics, then the authored intent. */
+export function assembleSeedPrompt(
+  ctx: OrchestratorPromptContext,
+  body: string,
+): string {
+  return `${projectContext(ctx)}\n\n${SEED_BASICS}\n\n${body}`;
+}
+
+/** Used when neither the enqueue call nor the prompt frontmatter names a model. */
+const DEFAULT_TASK_MODEL = 'claude-sonnet-4-6';
+
+/** Orchestrator tools are MCP tools under the `posthog-wizard` server. Frontmatter
+ *  names them short (e.g. `enqueue_task`); the SDK gates on the full name. */
+const ORCHESTRATOR_TOOL_PREFIX = 'mcp__posthog-wizard__';
+const ORCHESTRATOR_TOOLS = new Set([
+  'enqueue_task',
+  'complete_task',
+  'read_handoffs',
+]);
+
+/** A parsed agent prompt. The frontmatter fields plus the markdown body. */
+export interface AgentPrompt {
+  type: string;
+  /** Human-readable title for the TUI; falls back to `type` when absent. */
+  label?: string;
+  /** The flow this agent belongs to (the program id, e.g. `posthog-integration`). */
+  flow?: string;
+  /** Marks the flow's planner: it seeds the queue and is not an enqueueable task. */
+  seed: boolean;
+  model?: string;
+  skills: string[];
+  allowedTools: string[];
+  disallowedTools: string[];
+  dependsOn: string[];
+  body: string;
+}
+
+export interface AgentRegistry {
+  /** The flow's enqueueable task types — every prompt except the seed. */
+  readonly types: string[];
+  /** The flow's planner, the one prompt marked `seed: true` in its frontmatter. */
+  readonly seed?: AgentPrompt;
+  get(type: string): AgentPrompt | undefined;
+}
+
+/** Scopes the fetched prompts to one flow and indexes them by type. Pure. */
+export function buildRegistry(
+  prompts: readonly AgentPrompt[],
+  flow: string,
+): AgentRegistry {
+  const scoped = prompts.filter((prompt) => prompt.flow === flow);
+  const index = new Map<string, AgentPrompt>();
+  for (const prompt of scoped) index.set(prompt.type, prompt);
+
+  const seed = scoped.find((prompt) => prompt.seed);
+  const types = scoped.filter((p) => !p.seed).map((p) => p.type);
+  return { types, seed, get: (type) => index.get(type) };
+}
+
+interface AgentMenu {
+  agents: { id: string; downloadUrl: string }[];
+}
+
+/** Maps a frontmatter tool name to the name the SDK gates on: orchestrator
+ *  tools gain the MCP server prefix, native tools are returned unchanged. */
+function expandToolName(name: string): string {
+  if (!ORCHESTRATOR_TOOLS.has(name)) return name;
+  return ORCHESTRATOR_TOOL_PREFIX + name;
+}
+
+/** Both tool lists of a prompt, with orchestrator tool names MCP-qualified. */
+export function agentRunTools(prompt: AgentPrompt): {
+  allowedTools: string[];
+  disallowedTools: string[];
+} {
+  const qualify = (names: string[]) => names.map((n) => expandToolName(n));
+  const allowedTools = qualify(prompt.allowedTools);
+  const disallowedTools = qualify(prompt.disallowedTools);
+  return { allowedTools, disallowedTools };
+}
+
+function toStringArray(value: unknown): string[] {
+  if (!Array.isArray(value)) return [];
+  return value.filter((v): v is string => typeof v === 'string');
+}
+
+/**
+ * Parse the leading `---` frontmatter block and the markdown body. The
+ * frontmatter is a small, known schema (scalars and inline `[a, b]` arrays), so
+ * a tiny parser covers it without a YAML dependency. Inline `# comments` are
+ * stripped, and an empty scalar (`model:`) counts as absent so `??` fallbacks
+ * apply. `fallbackType` is the menu id, used when the frontmatter omits `type:`.
+ */
+export function parseAgentPrompt(
+  text: string,
+  fallbackType: string,
+): AgentPrompt {
+  const match = text.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/);
+  const frontmatter = match ? match[1] : '';
+  const body = (match ? match[2] : text).trim();
+
+  const fields: Record<string, unknown> = {};
+  for (const rawLine of frontmatter.split(/\r?\n/)) {
+    const line = rawLine.replace(/\s+#.*$/, '').trim();
+    if (!line || line.startsWith('#')) continue;
+    const kv = line.match(/^([\w-]+):\s*(.*)$/);
+    if (!kv) continue;
+    const [, key, raw] = kv;
+    if (raw.startsWith('[') && raw.endsWith(']')) {
+      fields[key] = raw
+        .slice(1, -1)
+        .split(',')
+        .map((s) => s.trim().replace(/^['"]|['"]$/g, ''))
+        .filter(Boolean);
+    } else {
+      fields[key] = raw.replace(/^['"]|['"]$/g, '');
+    }
+  }
+
+  const str = (v: unknown) => (typeof v === 'string' && v ? v : undefined);
+  return {
+    type: str(fields.type) ?? fallbackType,
+    label: str(fields.label),
+    flow: str(fields.flow),
+    seed: fields.seed === 'true',
+    model: str(fields.model),
+    skills: toStringArray(fields.skills),
+    allowedTools: toStringArray(fields.allowedTools),
+    disallowedTools: toStringArray(fields.disallowedTools),
+    dependsOn: toStringArray(fields.dependsOn),
+    body,
+  };
+}
+
+/** Resolves to the body text of `url`; throws when the response is not ok. */
+async function fetchText(url: string): Promise<string> {
+  const response = await fetch(url);
+  if (response.ok) return response.text();
+  const { status, statusText } = response;
+  throw new Error(`Fetch ${url} failed: ${status} ${statusText}`);
+}
+
+/**
+ * Fetch the agent menu and every agent prompt it lists, parse them, and build
+ * the registry for one flow. Rejects if the menu or any listed prompt cannot
+ * be fetched or the menu is not valid JSON; the orchestrator needs them all.
+ */
+export async function loadAgentRegistry(
+  skillsBaseUrl: string,
+  flow: string,
+): Promise<AgentRegistry> {
+  const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`);
+  const agents = (JSON.parse(menuRaw) as Partial<AgentMenu> | null)?.agents;
+  // Anything but an array here (or a `null` menu) means no prompts are listed.
+  const entries = Array.isArray(agents) ? agents : [];
+  const prompts = await Promise.all(
+    entries.map(async (entry) => {
+      const text = await fetchText(entry.downloadUrl);
+      return parseAgentPrompt(text, entry.id);
+    }),
+  );
+  return buildRegistry(prompts, flow);
+}
+
+/**
+ * Render a task's own inputs as a markdown section, so a fanned-out task (e.g.
+ * one `capture` per event) sees the specific thing it owns. '' when it has none.
+ */
+function renderInputs(task: QueuedTask): string {
+  const bullets = Object.entries(task.inputs ?? {}).map(
+    ([name, value]) => `- ${name}: ${formatInputValue(value)}`,
+  );
+  return bullets.length ? `## Your task input\n\n${bullets.join('\n')}` : '';
+}
+
+/** Strings render verbatim; every other input value renders as JSON. */
+function formatInputValue(value: unknown): string {
+  return typeof value === 'string' ? value : JSON.stringify(value);
+}
+
+/**
+ * Build the "Context from previous steps" section from the handoffs of the
+ * task's dependencies that have one, for the fresh agent; '' when none do.
+ */
+function renderHandoffContext(task: QueuedTask, store: QueueStore): string {
+  const sections: string[] = [];
+  for (const depId of task.dependsOn) {
+    const upstream = store.get(depId);
+    const handoff = store.readHandoff(depId);
+    // Without both the task record and its handoff there is nothing to show.
+    if (!upstream || !handoff) continue;
+    const { did, forNextAgent, filesTouched: files } = handoff;
+    const entry = [`### ${upstream.type}`, `- did: ${did}`];
+    entry.push(`- for you: ${forNextAgent}`);
+    if (files?.length) entry.push(`- files: ${files.join(', ')}`);
+    sections.push(entry.join('\n'));
+  }
+  if (sections.length === 0) return '';
+  const context = sections.join('\n\n');
+  return `## Context from previous steps\n\n${context}`.trim();
+}
+
+/**
+ * Resolve a queued task to its run config: the prompt (the task's inputs, the
+ * authored body, then its dependencies' handoffs), the model, the skills, and
+ * the tool lists with orchestrator tool names MCP-qualified. The model
+ * precedence is enqueue override, then prompt, then default. Throws if no
+ * prompt is registered for the task's type.
+ */
+export function resolveTask(
+  registry: AgentRegistry,
+  task: QueuedTask,
+  store: QueueStore,
+): ResolvedTask {
+  const agentPrompt = registry.get(task.type);
+  if (!agentPrompt) {
+    throw new Error(`No agent prompt registered for task type "${task.type}"`);
+  }
+
+  // Inputs lead, the authored body follows, and upstream handoffs close;
+  // sections that rendered empty are dropped before joining.
+  const parts = [renderInputs(task), agentPrompt.body];
+  parts.push(renderHandoffContext(task, store));
+  const tools = agentRunTools(agentPrompt);
+
+  return {
+    model: task.model ?? agentPrompt.model ?? DEFAULT_TASK_MODEL,
+    allowedTools: tools.allowedTools,
+    disallowedTools: tools.disallowedTools,
+    prompt: parts.filter(Boolean).join('\n\n'),
+    skills: agentPrompt.skills,
+  };
+}
diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts
new file mode 100644
index 00000000..978a8f31
--- /dev/null
+++ b/src/lib/programs/orchestrator/orchestrator-runner.ts
@@ -0,0 +1,296 @@
+/**
+ * Experimental task-queue orchestrator runner.
+ *
+ * Branches from the linear runner when the `wizard-orchestrator` flag is on. An
+ * orchestrator agent inspects the repo and seeds a task queue (a `QueueStore`
+ * rooted in the install dir); an executor drains it, one fresh agent per task.
+ *
+ * Both the WHAT (agent prompts: model, goal, success criteria, tools) and the
+ * HOW (mini-skills) are markdown served from context-mill — the seed and every
+ * task resolve to a prompt fetched at startup into the registry. The wizard side
+ * stays product-ignorant: it is the queue, the executor, and the loader.
+ */
+import { randomUUID } from 'crypto';
+import { existsSync } from 'fs';
+import * as path from 'path';
+import {
+  initializeAgent,
+  runAgent,
+  type AgentConfig,
+} from '../../agent/agent-interface';
+import { OutroKind, type WizardSession } from '../../wizard-session';
+import { detectNodePackageManagers } from '../../detection/package-manager';
+import { installSkillById } from '../../wizard-tools';
+import { getUI } from '../../../ui';
+import { analytics } from '../../../utils/analytics';
+import { logToFile } from '../../../utils/debug';
+import type { ProgramConfig } from '../program-step';
+import type { BootstrapResult } from '../../agent/agent-runner';
+import type { WizardRunOptions } from '../../../utils/types';
+import { QueueStore, QUEUE_DIR_NAME, TaskStatus } from './queue';
+import { drainQueue, type RunTask } from './executor';
+import {
+  agentRunTools,
+  assembleSeedPrompt,
+  assembleTaskPrompt,
+  loadAgentRegistry,
+  resolveTask,
+  type OrchestratorPromptContext,
+} from './agent-prompt-loader';
+
+/**
+ * Translate a queue task status into the status string of a UI todo item.
+ * Running becomes 'in_progress' and Skipped stays 'skipped'. Done and Failed
+ * both become 'completed'; a status with no mapping of its own falls
+ * through to 'pending'.
+ */
+function toTodoStatus(status: TaskStatus): string {
+  if (status === TaskStatus.Running) return 'in_progress';
+  if (status === TaskStatus.Skipped) return 'skipped';
+  // Failed counts as finished here, exactly like Done.
+  const finished = status === TaskStatus.Done || status === TaskStatus.Failed;
+  return finished ? 'completed' : 'pending';
+}
+
+function sessionRunOptions(session: WizardSession): WizardRunOptions {
+  return {
+    installDir: session.installDir,
+    debug: session.debug,
+    default: false,
+    signup: session.signup,
+    localMcp: session.localMcp,
+    ci: session.ci,
+    benchmark: session.benchmark,
+    projectId: session.projectId,
+    apiKey: session.apiKey,
+    yaraReport: session.yaraReport,
+  };
+}
+
+export async function runOrchestrator(
+  session: WizardSession,
+  programConfig: ProgramConfig,
+  boot: BootstrapResult,
+): Promise<void> {
+  const runId = randomUUID();
+  const store = new QueueStore(session.installDir, runId);
+
+  const options = sessionRunOptions(session);
+
+  // The WHAT (agent prompts) is served from context-mill. Fetch the registry
+  // once up front: its types drive enqueue validation, and resolving a task to
+  // its run config is then synchronous, with no mid-drain network latency.
+  const registry = await loadAgentRegistry(
+    boot.skillsBaseUrl,
+    programConfig.id,
+  );
+  const seedPrompt = registry.seed;
+  if (!seedPrompt) {
+    throw new Error(
+      `No seed agent prompt (frontmatter \`seed: true\`) for flow "${programConfig.id}" is available from ${boot.skillsBaseUrl}.`,
+    );
+  }
+
+  // Give task agents the framework's finished reference integration to match,
+  // the same EXAMPLE.md the linear flow uses. Install it under the run dir rather
+  // than .claude/skills so its "do everything" workflow is not auto-loaded as a
+  // skill — only the example file is read, when the agent's prompt points at it.
+  let examplePath: string | undefined;
+  let commandmentsPath: string | undefined;
+  if (session.skillId) {
+    const ref = await installSkillById(
+      session.skillId,
+      session.installDir,
+      boot.skillsBaseUrl,
+      path.join(QUEUE_DIR_NAME, 'reference'),
+    );
+    if (ref.kind === 'ok') {
+      const example = path.join(ref.path, 'references', 'EXAMPLE.md');
+      if (existsSync(path.join(session.installDir, example))) {
+        examplePath = example;
+      }
+      const commandments = path.join(ref.path, 'references', 'COMMANDMENTS.md');
+      if (existsSync(path.join(session.installDir, commandments))) {
+        commandmentsPath = commandments;
+      }
+    } else {
+      logToFile(`[orchestrator] reference example unavailable: ${ref.kind}`);
+    }
+  }
+
+  // The client injects the basics (project context + the I/O contract) around
+  // every authored agent-prompt body.
+  const promptContext: OrchestratorPromptContext = {
+    projectId: boot.projectId,
+    projectApiKey: boot.projectApiKey,
+    host: boot.host,
+    examplePath,
+    commandmentsPath,
+  };
+
+  logToFile(
+    `[orchestrator] START program=${programConfig.id} dir=${session.installDir} run=${runId}`,
+  );
+  analytics.wizardCapture('orchestrator started', {
+    program_id: programConfig.id,
+  });
+  getUI().startRun();
+
+  // Label precedence: what the orchestrator set at enqueue, then the agent
+  // prompt's default, then the bare type.
+  const labelFor = (t: { type: string; label?: string }) =>
+    t.label ?? registry.get(t.type)?.label ?? t.type;
+  const renderQueue = () =>
+    getUI().syncTodos(
+      store.list().map((t) => ({
+        content: labelFor(t),
+        status: toTodoStatus(t.status),
+        activeForm: labelFor(t),
+      })),
+    );
+
+  // Each agent gets its own config so its wizard-tools server is bound to the
+  // task it runs — independent tasks run in parallel, and attribution of
+  // complete_task / enqueue_task must hold per agent. The seed is not a task,
+  // so its context has no task id.
+  const agentConfigFor = (currentTaskId?: string): AgentConfig => ({
+    workingDirectory: session.installDir,
+    posthogMcpUrl: boot.mcpUrl,
+    posthogApiKey: boot.accessToken,
+    posthogApiHost: boot.host,
+    detectPackageManager: detectNodePackageManagers,
+    skillsBaseUrl: boot.skillsBaseUrl,
+    wizardFlags: boot.wizardFlags,
+    // Tag agent events as orchestrator so telemetry segments from the baseline.
+    wizardMetadata: { ...boot.wizardMetadata, VARIANT: 'orchestrator' },
+    integrationLabel: programConfig.id,
+    orchestrator: {
+      store,
+      validTypes: registry.types,
+      currentTaskId,
+    },
+  });
+
+  const spinner = getUI().spinner();
+
+  // 1. Seed the queue with the orchestrator agent. It is itself an agent prompt
+  // (the WHAT), so its model and tools come from its frontmatter. The seed
+  // plans the graph, it is not a task.
+  const seedAgent = await initializeAgent(agentConfigFor(), options);
+  const seedResult = await runAgent(
+    {
+      ...seedAgent,
+      model: seedPrompt.model ?? seedAgent.model,
+      ...agentRunTools(seedPrompt),
+    },
+    assembleSeedPrompt(promptContext, seedPrompt.body),
+    options,
+    spinner,
+    {
+      spinnerMessage: 'Planning the integration...',
+      successMessage: 'Planned the integration',
+      additionalFeatureQueue: [],
+      requestRemark: false,
+    },
+  );
+  if (seedResult.error) {
+    logToFile(
+      `[orchestrator] seed error: ${seedResult.error} ${
+        seedResult.message ?? ''
+      }`,
+    );
+  }
+  analytics.wizardCapture('orchestrator seeded', {
+    task_count: store.list().length,
+    types: store.list().map((t) => t.type),
+  });
+  renderQueue();
+
+  // 2. Drain the queue, one fresh agent per task; independent tasks run in
+  // parallel, the seed's graph being the only schedule. Each task resolves to
+  // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then
+  // runs on its own model and tools.
+  const runTask: RunTask = async (task) => {
+    renderQueue();
+    try {
+      const resolved = resolveTask(registry, task, store);
+      const agent = await initializeAgent(agentConfigFor(task.id), options);
+      for (const skillId of resolved.skills) {
+        const result = await installSkillById(
+          skillId,
+          session.installDir,
+          boot.skillsBaseUrl,
+        );
+        if (result.kind !== 'ok') {
+          logToFile(
+            `[orchestrator] skill install failed type=${task.type} skill=${skillId} ${result.kind}`,
+          );
+        }
+      }
+      await runAgent(
+        {
+          ...agent,
+          model: resolved.model,
+          allowedTools: resolved.allowedTools,
+          disallowedTools: resolved.disallowedTools,
+        },
+        assembleTaskPrompt(promptContext, resolved.prompt),
+        options,
+        spinner,
+        // Empty messages suppress the per-task spinner lines (the spinner renders
+        // only when a message is set); the queue panel shows progress. Errors
+        // still surface — runAgent stops the spinner with its own error text.
+        // No per-task remark — the reflection would fire on every task.
+        {
+          spinnerMessage: '',
+          successMessage: '',
+          additionalFeatureQueue: [],
+          requestRemark: false,
+        },
+      );
+    } finally {
+      renderQueue();
+    }
+  };
+  await drainQueue(store, runTask);
+
+  renderQueue();
+
+  const summary = store.summary();
+  logToFile(
+    `[orchestrator] DONE done=${summary.done} failed=${summary.failed} total=${summary.total}`,
+  );
+  analytics.wizardCapture('orchestrator run finished', {
+    tasks_total: summary.total,
+    tasks_done: summary.done,
+    tasks_failed: summary.failed,
+  });
+
+  // The build step flags any unresolved conflict in its handoff; surface the
+  // one-liner here and point the user at the report for the detail.
+  const buildTask = store.list().find((t) => t.type === 'build');
+  const conflict = buildTask
+    ? store.readHandoff(buildTask.id)?.conflict
+    : undefined;
+
+  // Prefer the report the run wrote; fall back to the raw queue if it is missing.
+  const reportPath = path.join(session.installDir, 'posthog-setup-report.md');
+  const reportFile = existsSync(reportPath)
+    ? 'posthog-setup-report.md'
+    : store.queuePath;
+
+  const message = conflict
+    ? 'PostHog set up, with one conflict to review.'
+    : `PostHog set up: ${summary.done}/${summary.total} steps completed.`;
+  getUI().setOutroData({
+    kind: OutroKind.Success,
+    message,
+    body: conflict
+      ? `⚠ Build conflict: ${conflict}\nFull details are in the report.`
+      : undefined,
+    reportFile,
+    docsUrl: 'https://posthog.com/docs/ai-engineering/ai-wizard',
+  });
+  getUI().outro(message);
+  await analytics.shutdown('success');
+}
diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts
index 6d3aad52..64e5bc93 100644
--- a/src/lib/programs/orchestrator/queue-tools.ts
+++ b/src/lib/programs/orchestrator/queue-tools.ts
@@ -29,6 +29,7 @@ export interface OrchestratorToolsContext {
 
 export interface EnqueueArgs {
   type: string;
+  label?: string;
   inputs?: Record<string, unknown>;
   dependsOn?: string[];
   model?: string;
@@ -115,6 +116,7 @@ export function applyEnqueue(
 
   const task = ctx.store.enqueue({
     type: args.type,
+    label: args.label,
     inputs: args.inputs ?? {},
     dependsOn: args.dependsOn ?? [],
     model: args.model,
@@ -175,6 +177,12 @@ const HANDOFF_SHAPE = {
   did: z.string().describe('What you actually did.'),
   forNextAgent: z.string().describe('What the next agent should know.'),
   filesTouched: z.array(z.string()).optional(),
+  conflict: z
+    .string()
+    .optional()
+    .describe(
+      'A one-line summary of any conflict you could not cleanly resolve (e.g. a dependency or build conflict). Put full detail in your work; this line is surfaced to the user.',
+    ),
 };
 
 type SdkTool = (
@@ -204,6 +212,12 @@ export function buildOrchestratorTools(
       type: z
         .string()
         .describe(`The task type. One of: ${ctx.validTypes.join(', ')}.`),
+      label: z
+        .string()
+        .optional()
+        .describe(
+          'A short label for the UI — the action in a few words (e.g. "Add the PostHog SDK", "Initialize PostHog"). Leave out file names, class names, and other specifics.',
+        ),
       inputs: z.record(z.unknown()).optional(),
       dependsOn: z
         .array(z.string())
diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts
index 5f62c718..4ecc3cb5 100644
--- a/src/lib/programs/orchestrator/queue.ts
+++ b/src/lib/programs/orchestrator/queue.ts
@@ -28,6 +28,8 @@ export type TaskStatus = (typeof TaskStatus)[keyof typeof TaskStatus];
 export interface QueuedTask {
   id: string;
   type: string;
+  /** Human-readable label for the TUI, set by the enqueuing agent. */
+  label?: string;
   status: TaskStatus;
   dependsOn: string[];
   inputs: Record<string, unknown>;
@@ -56,10 +58,13 @@ export interface TaskHandoff {
   did: string;
   forNextAgent: string;
   filesTouched?: string[];
+  /** A one-line summary of any unresolved conflict, surfaced in the outro. */
+  conflict?: string;
 }
 
 export interface EnqueueInput {
   type: string;
+  label?: string;
   inputs?: Record<string, unknown>;
   dependsOn?: string[];
   model?: string;
@@ -155,6 +160,7 @@ export class QueueStore {
     const task: QueuedTask = {
       id: randomUUID(),
       type: input.type,
+      label: input.label,
       status: TaskStatus.Pending,
       dependsOn: input.dependsOn ?? [],
       inputs: input.inputs ?? {},
diff --git a/src/lib/task-stream/task-stream-push.ts b/src/lib/task-stream/task-stream-push.ts
index cecd9ff8..02815419 100644
--- a/src/lib/task-stream/task-stream-push.ts
+++ b/src/lib/task-stream/task-stream-push.ts
@@ -37,6 +37,8 @@ const STATUS_MAP: Record<TaskStatus, StreamTaskStatus> = {
   [TaskStatus.Pending]: StreamTaskStatus.Pending,
   [TaskStatus.InProgress]: StreamTaskStatus.InProgress,
   [TaskStatus.Completed]: StreamTaskStatus.Completed,
+  // The stream has no skipped state; skipped is terminal, so report it resolved.
+  [TaskStatus.Skipped]: StreamTaskStatus.Completed,
 };
 
 function buildTasks(items: TaskItem[]): StreamTask[] {
diff --git a/src/ui/logging-ui.ts b/src/ui/logging-ui.ts
index fd0c34f1..9ae0a2ee 100644
--- a/src/ui/logging-ui.ts
+++ b/src/ui/logging-ui.ts
@@ -232,20 +232,22 @@ export class LoggingUI implements WizardUI {
     // the session.
   }
 
+  private lastTodoLine = '';
+
   syncTodos(
     todos: Array<{ content: string; status: string; activeForm?: string }>,
   ): void {
     const completed = todos.filter(
       (t) => t.status === TaskStatus.Completed,
     ).length;
-    const inProgress = todos.find((t) => t.status === TaskStatus.InProgress);
-    if (inProgress) {
-      console.log(
-        `◌  [${completed}/${todos.length}] ${
-          inProgress.activeForm || inProgress.content
-        }`,
-      );
-    }
+    const active = todos.filter((t) => t.status === TaskStatus.InProgress);
+    if (active.length === 0) return;
+    const labels = active.map((t) => t.activeForm || t.content).join(' · ');
+    const line = `◌  [${completed}/${todos.length}] ${labels}`;
+    // The queue re-renders on every transition; print only what changed.
+    if (line === this.lastTodoLine) return;
+    this.lastTodoLine = line;
+    console.log(line);
   }
 
   setEventPlan(_events: Array<{ name: string; description: string }>): void {
diff --git a/src/ui/tui/primitives/ProgressList.tsx b/src/ui/tui/primitives/ProgressList.tsx
index b72156c8..3c84c8ee 100644
--- a/src/ui/tui/primitives/ProgressList.tsx
+++ b/src/ui/tui/primitives/ProgressList.tsx
@@ -11,7 +11,7 @@ import { LoadingBox } from './LoadingBox.js';
 export interface ProgressItem {
   label: string;
   activeForm?: string;
-  status: 'pending' | 'in_progress' | 'completed';
+  status: 'pending' | 'in_progress' | 'completed' | 'skipped';
 }
 
 interface ProgressListProps {
@@ -20,7 +20,9 @@ interface ProgressListProps {
 }
 
 export const ProgressList = ({ items, title }: ProgressListProps) => {
-  const completed = items.filter((t) => t.status === 'completed').length;
+  const resolved = items.filter(
+    (t) => t.status === 'completed' || t.status === 'skipped',
+  ).length;
   const total = items.length;
 
   return (
@@ -33,6 +35,7 @@ export const ProgressList = ({ items, title }: ProgressListProps) => {
       )}
       {items.length === 0 && <LoadingBox message="Analyzing project..." />}
       {items.map((item, i) => {
+        const skipped = item.status === 'skipped';
         const icon =
           item.status === 'completed'
             ? Icons.squareFilled
@@ -45,15 +48,22 @@ export const ProgressList = ({ items, title }: ProgressListProps) => {
             : item.status === 'in_progress'
             ? Colors.primary
             : Colors.muted;
-        const label =
-          item.status === 'in_progress' && item.activeForm
-            ? item.activeForm
-            : item.label;
+        const label = skipped
+          ? `${item.label} (skipped)`
+          : item.status === 'in_progress' && item.activeForm
+          ? item.activeForm
+          : item.label;
 
         return (
           <Text key={i}>
             <Text color={color}>{icon}</Text>
-            <Text dimColor={item.status === 'pending'}> {label}</Text>
+            <Text
+              dimColor={item.status === 'pending' || skipped}
+              strikethrough={skipped}
+            >
+              {' '}
+              {label}
+            </Text>
           </Text>
         );
       })}
@@ -61,8 +71,8 @@ export const ProgressList = ({ items, title }: ProgressListProps) => {
         <Box marginTop={1} gap={1}>
           <Spinner />
           <Text dimColor>
-            {completed < total
-              ? `Progress: ${completed}/${total} completed`
+            {resolved < total
+              ? `Progress: ${resolved}/${total} completed`
               : 'Cleaning up...'}
           </Text>
         </Box>
diff --git a/src/ui/wizard-ui.ts b/src/ui/wizard-ui.ts
index 97216262..cbb494d4 100644
--- a/src/ui/wizard-ui.ts
+++ b/src/ui/wizard-ui.ts
@@ -21,6 +21,7 @@ export enum TaskStatus {
   Pending = 'pending',
   InProgress = 'in_progress',
   Completed = 'completed',
+  Skipped = 'skipped',
 }
 
 export function isTaskStatus(value: string): value is TaskStatus {

From f26d97840dca23cf4055c84e2f6990778be42d8a Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:22:34 -0400
Subject: [PATCH 06/12] feat(analytics): identify the user so feature flags can
 target by email (#620)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/lib/agent/agent-runner.ts |  3 ++
 src/utils/analytics.ts        | 58 ++++++++++++++++++++---------------
 2 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index 07a89c0c..a08320ed 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -349,6 +349,9 @@ async function bootstrapProgram(
   getUI().setRoleAtOrganization(roleAtOrganization);
   getUI().setApiUser(user);
 
+  // Identify the user (email, name) before evaluating flags, so flags can target
+  // the individual user and not just $app_name.
+  if (user) analytics.identifyUser(user);
   analytics.setGroups(groupsFromUser(user, host));
 
   // 4.5. AI opt-in enforcement. Parks here while AiOptInRequiredScreen is
diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts
index 48a0717b..bf849a7e 100644
--- a/src/utils/analytics.ts
+++ b/src/utils/analytics.ts
@@ -8,7 +8,7 @@ import type { WizardSession } from '@lib/wizard-session';
 import type { ApiUser } from '@lib/api';
 import { v4 as uuidv4 } from 'uuid';
 import { IS_PRODUCTION_BUILD } from '@env';
-import { debug } from './debug';
+import { debug, logToFile } from './debug';
 
 /**
  * Extract a standard property bag from the current session.
@@ -58,6 +58,7 @@ export class Analytics {
   private appName = 'wizard';
   private activeFlags: Record<string, string> | null = null;
   private groups: Record<string, string> = {};
+  private personProperties: Record<string, string> = {};
 
   constructor() {
     this.client = new PostHog(ANALYTICS_POSTHOG_PUBLIC_PROJECT_WRITE_KEY, {
@@ -107,10 +108,12 @@ export class Analytics {
   }
 
   /**
-   * Associate the run with the logged-in user, once per id: identify them
-   * (email, name), then alias the run's anonymous id onto the identified
-   * person so pre-login events merge in. Alias only ever fires after
-   * identification.
+   * Associate the run with the logged-in user, once per id. Identifies them
+   * (email, name) and records those person properties so events carry them and
+   * feature flags can target the individual user — without the email here the
+   * wizard only sends `$app_name`, so email-targeted flags never match. Opens
+   * the analytics session on first login, then aliases the run's anonymous id
+   * onto the identified person so pre-login events merge in.
    */
   identifyUser(user: ApiUser) {
     const distinctId = user.distinct_id;
@@ -127,25 +130,28 @@ export class Analytics {
       this.sessionId = uuidv4();
       this.tags.$session_id = this.sessionId;
     }
-    this.client.identify({
-      distinctId,
-      properties: {
-        $set: {
-          ...(user.email ? { email: user.email } : {}),
-          ...(user.first_name || user.last_name
-            ? {
-                name: [user.first_name, user.last_name]
-                  .filter(Boolean)
-                  .join(' '),
-              }
-            : {}),
-        },
-      },
-    });
+    const props: Record<string, string> = {};
+    if (user.email) props.email = user.email;
+    const name = [user.first_name, user.last_name]
+      .filter(Boolean)
+      .join(' ')
+      .trim();
+    if (name) props.name = name;
+    this.personProperties = props;
+    this.client.identify({ distinctId, properties: { $set: props } });
     this.client.alias({
       distinctId,
       alias: this.anonymousId,
     });
+    // The flag snapshot is per identity. Anything evaluated before login (the
+    // intro screen reads the tools-menu flag) was anonymous — drop it so the
+    // next read re-evaluates as this user.
+    this.activeFlags = null;
+  }
+
+  /** Person properties sent with flag evaluation: app name plus the user's. */
+  private flagPersonProperties(): Record<string, string> {
+    return { $app_name: this.appName, ...this.personProperties };
   }
 
   setTag(key: string, value: string | boolean | number | null | undefined) {
@@ -198,9 +204,7 @@ export class Analytics {
       const distinctId = this.distinctId ?? this.anonymousId;
       return await this.client.getFeatureFlag(flagKey, distinctId, {
         sendFeatureFlagEvents: true,
-        personProperties: {
-          $app_name: this.appName,
-        },
+        personProperties: this.flagPersonProperties(),
       });
     } catch (error) {
       debug('Failed to get feature flag:', flagKey, error);
@@ -219,8 +223,13 @@ export class Analytics {
     }
     try {
       const distinctId = this.distinctId ?? this.anonymousId;
+      logToFile('[flags] evaluating as', {
+        distinctId,
+        identified: this.distinctId !== undefined,
+        personProperties: this.flagPersonProperties(),
+      });
       const result = await this.client.getAllFlagsAndPayloads(distinctId, {
-        personProperties: { $app_name: this.appName },
+        personProperties: this.flagPersonProperties(),
       });
       const flags = result.featureFlags ?? {};
       const out: Record<string, string> = {};
@@ -229,6 +238,7 @@ export class Analytics {
         out[key] = typeof value === 'boolean' ? String(value) : String(value);
       }
       this.activeFlags = out;
+      logToFile('[flags] evaluated', out);
       return out;
     } catch (error) {
       debug('Failed to get all feature flags:', error);

From 1d7127d9dafe936a71667b898fb94d3ba6a2757b Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:22:43 -0400
Subject: [PATCH 07/12] feat(ci): flag overrides that exist only in CI builds
 (#635)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/smoke-test.sh                         | 33 +++++++++-
 src/env.ts                                    |  4 ++
 src/utils/__tests__/ci-flag-overrides.test.ts | 63 +++++++++++++++++++
 src/utils/analytics.ts                        | 17 +++--
 src/utils/ci-flag-overrides.ts                | 46 ++++++++++++++
 5 files changed, 157 insertions(+), 6 deletions(-)
 create mode 100644 src/utils/__tests__/ci-flag-overrides.test.ts
 create mode 100644 src/utils/ci-flag-overrides.ts

diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh
index 5abaaca9..1e9a690b 100755
--- a/scripts/smoke-test.sh
+++ b/scripts/smoke-test.sh
@@ -19,7 +19,38 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH
   exit 1
 }
 
-# ── 2. --ci rejected in production builds ────────────────────────────────────
+# ── 2. CI flag overrides physically absent from production builds ───────────
+# The override path (src/utils/ci-flag-overrides.ts) is dead code in published
+# builds and tsdown strips it; its env var name appearing in dist/*.js means
+# dead-code elimination regressed and a prod surface leaked. Sourcemaps keep
+# the original source, so only .js output counts.
+OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES'
+if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
+  # CI builds must keep the path — its absence means the override silently
+  # stopped working and CI is back to testing live flags.
+  if ! grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
+    echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2
+    exit 1
+  fi
+  # And a real invocation must accept the env var. yargs claims every
+  # POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects
+  # unknown ones during command parse (--version/--help short-circuit and
+  # prove nothing). The run exits fast on the missing api key — all this
+  # asserts is that yargs did not reject the environment.
+  ci_probe=$(WIZARD_CI_FLAG_OVERRIDES='{"wizard-orchestrator":true}' node "$DIST_BIN" --ci --install-dir /tmp/wizard-smoke-probe 2>&1) || true
+  if echo "$ci_probe" | grep -q 'Unknown argument'; then
+    echo 'Smoke test failed: CI binary rejects WIZARD_CI_FLAG_OVERRIDES in the environment' >&2
+    echo "$ci_probe" | head -3 >&2
+    exit 1
+  fi
+else
+  if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
+    echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2
+    exit 1
+  fi
+fi
+
+# ── 3. --ci rejected in production builds ────────────────────────────────────
 # build:ci sets WIZARD_BUILD_NODE_ENV=ci → --ci stays enabled → skip the check.
 if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
   exit 0
diff --git a/src/env.ts b/src/env.ts
index 6eec7cad..c32e886a 100644
--- a/src/env.ts
+++ b/src/env.ts
@@ -39,6 +39,10 @@ export const IS_PRODUCTION_BUILD = process.env.NODE_ENV === 'production';
  * Add new keys here when a new runtime dependency is needed.
  */
 type RuntimeEnvKey =
+  // CI-build-only flag overrides (see utils/ci-flag-overrides.ts).
+  // Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD')
+  // would claim it as an unknown CLI option and strict-reject the run.
+  | 'WIZARD_CI_FLAG_OVERRIDES'
   // Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix)
   | 'POSTHOG_WIZARD_BENCHMARK_CONFIG'
   | 'POSTHOG_WIZARD_BENCHMARK_FILE'
diff --git a/src/utils/__tests__/ci-flag-overrides.test.ts b/src/utils/__tests__/ci-flag-overrides.test.ts
new file mode 100644
index 00000000..4d2333a1
--- /dev/null
+++ b/src/utils/__tests__/ci-flag-overrides.test.ts
@@ -0,0 +1,63 @@
+import { applyCiFlagOverrides } from '@utils/ci-flag-overrides';
+
+jest.mock('@utils/debug', () => ({
+  logToFile: jest.fn(),
+  debug: jest.fn(),
+}));
+
+const ENV_KEY = 'WIZARD_CI_FLAG_OVERRIDES';
+
+describe('applyCiFlagOverrides', () => {
+  afterEach(() => {
+    delete process.env[ENV_KEY];
+  });
+
+  // Jest runs with NODE_ENV=test, so IS_PRODUCTION_BUILD is false and the
+  // override path is live — the same shape a `build:ci` bundle has.
+  describe('in CI builds', () => {
+    it('returns the flags untouched when no override is set', () => {
+      const flags = { 'wizard-orchestrator': 'false' };
+      expect(applyCiFlagOverrides(flags)).toEqual(flags);
+    });
+
+    it('merges overrides over the fetched flags, stringifying values', () => {
+      process.env[ENV_KEY] = JSON.stringify({
+        'wizard-orchestrator': true,
+        'wizard-next-v2': 'legacy',
+      });
+      expect(
+        applyCiFlagOverrides({
+          'wizard-orchestrator': 'false',
+          'wizard-react-router': 'true',
+        }),
+      ).toEqual({
+        'wizard-orchestrator': 'true',
+        'wizard-next-v2': 'legacy',
+        'wizard-react-router': 'true',
+      });
+    });
+
+    it('fails loudly on malformed JSON instead of testing live flags', () => {
+      process.env[ENV_KEY] = 'wizard-orchestrator=true';
+      expect(() => applyCiFlagOverrides({})).toThrow(/not valid JSON/);
+    });
+  });
+
+  describe('in production builds', () => {
+    it('is inert: overrides are ignored even when the env var is set', () => {
+      const prevNodeEnv = process.env.NODE_ENV;
+      process.env.NODE_ENV = 'production';
+      process.env[ENV_KEY] = JSON.stringify({ 'wizard-orchestrator': true });
+      let result: Record<string, string> | undefined;
+      jest.isolateModules(() => {
+        // eslint-disable-next-line @typescript-eslint/no-var-requires
+        const prod = require('@utils/ci-flag-overrides') as {
+          applyCiFlagOverrides: typeof applyCiFlagOverrides;
+        };
+        result = prod.applyCiFlagOverrides({ 'wizard-orchestrator': 'false' });
+      });
+      process.env.NODE_ENV = prevNodeEnv;
+      expect(result).toEqual({ 'wizard-orchestrator': 'false' });
+    });
+  });
+});
diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts
index bf849a7e..d034f949 100644
--- a/src/utils/analytics.ts
+++ b/src/utils/analytics.ts
@@ -9,6 +9,7 @@ import type { ApiUser } from '@lib/api';
 import { v4 as uuidv4 } from 'uuid';
 import { IS_PRODUCTION_BUILD } from '@env';
 import { debug, logToFile } from './debug';
+import { applyCiFlagOverrides } from './ci-flag-overrides';
 
 /**
  * Extract a standard property bag from the current session.
@@ -221,6 +222,7 @@ export class Analytics {
     if (this.activeFlags !== null) {
       return this.activeFlags;
     }
+    const out: Record<string, string> = {};
     try {
       const distinctId = this.distinctId ?? this.anonymousId;
       logToFile('[flags] evaluating as', {
@@ -232,18 +234,23 @@ export class Analytics {
         personProperties: this.flagPersonProperties(),
       });
       const flags = result.featureFlags ?? {};
-      const out: Record<string, string> = {};
       for (const [key, value] of Object.entries(flags)) {
         if (value === undefined) continue;
         out[key] = typeof value === 'boolean' ? String(value) : String(value);
       }
-      this.activeFlags = out;
-      logToFile('[flags] evaluated', out);
-      return out;
     } catch (error) {
       debug('Failed to get all feature flags:', error);
-      return {};
+      this.captureException(
+        error instanceof Error ? error : new Error(String(error)),
+        { step: 'get_all_flags' },
+      );
     }
+    // Outside the fetch guard on purpose: a malformed CI override must fail
+    // the run loudly, and a valid one applies even when the fetch failed —
+    // CI routing stays deterministic either way.
+    this.activeFlags = applyCiFlagOverrides(out);
+    logToFile('[flags] evaluated', this.activeFlags);
+    return this.activeFlags;
   }
 
   async shutdown(status: 'success' | 'error' | 'cancelled') {
diff --git a/src/utils/ci-flag-overrides.ts b/src/utils/ci-flag-overrides.ts
new file mode 100644
index 00000000..e8790e23
--- /dev/null
+++ b/src/utils/ci-flag-overrides.ts
@@ -0,0 +1,62 @@
+/**
+ * CI-only feature-flag overrides.
+ *
+ * CI must route deterministically: a run that tests the orchestrator arm says
+ * so explicitly instead of depending on a live feature flag someone can edit
+ * mid-week. `WIZARD_CI_FLAG_OVERRIDES` is a JSON object of flag key →
+ * value, merged over whatever PostHog returned.
+ *
+ * The override path exists only in CI builds (`pnpm build:ci`). Published
+ * builds inline NODE_ENV as the literal "production", the guard below
+ * collapses, and tsdown strips the rest from the bundle — and the smoke test
+ * asserts the env var's name is physically absent from production output, so
+ * this can never quietly become a production surface.
+ */
+import { runtimeEnv } from '@env';
+import { logToFile } from './debug';
+
+/**
+ * Merge `WIZARD_CI_FLAG_OVERRIDES` over the fetched flag snapshot.
+ *
+ * @param flags - Flag key → stringified value, as evaluated for this run.
+ * @returns `flags` itself when no override applies (production build, or the
+ *   env var is unset/empty); otherwise a new object with every override value
+ *   stringified and laid over `flags`.
+ * @throws Error when the env var is set but does not hold a JSON object.
+ */
+export function applyCiFlagOverrides(
+  flags: Record<string, string>,
+): Record<string, string> {
+  // Compared inline (not via env.ts's IS_PRODUCTION_BUILD) so tsdown replaces
+  // it with a literal right here and the bundler can prove the rest of this
+  // function unreachable in production builds. The smoke test enforces that.
+  if (process.env.NODE_ENV === 'production') return flags;
+
+  const raw = runtimeEnv('WIZARD_CI_FLAG_OVERRIDES');
+  if (!raw) return flags;
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    // A malformed override is a CI misconfiguration. Fail the run loudly
+    // rather than silently testing whatever the live flags happen to say.
+    throw new Error(
+      'WIZARD_CI_FLAG_OVERRIDES is not valid JSON (expected {"flag-key": value, ...}).',
+    );
+  }
+  // Valid JSON is not enough: `null` would crash Object.entries below, and an
+  // array or bare string would merge index keys ("0", "1", ...) as flags.
+  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+    throw new Error(
+      'WIZARD_CI_FLAG_OVERRIDES must be a JSON object (expected {"flag-key": value, ...}).',
+    );
+  }
+
+  const merged = { ...flags };
+  for (const [key, value] of Object.entries(parsed)) {
+    merged[key] = String(value);
+  }
+  logToFile('[flags] CI overrides applied', parsed);
+  return merged;
+}

From 3b69e46c78d02a08a7f734ff134b424723980d31 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:22:53 -0400
Subject: [PATCH 08/12] feat(orchestrator): task instructions are ephemeral,
 not keepable skills (#637)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../__tests__/agent-prompt-loader.test.ts     | 24 ++++++++++++++++++
 .../orchestrator/agent-prompt-loader.ts       | 13 ++++++++++
 .../orchestrator/orchestrator-runner.ts       | 25 ++++++++++++++++---
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
index 64a4bdab..8252e791 100644
--- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
+++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
@@ -3,11 +3,13 @@ import * as os from 'os';
 import * as path from 'path';
 import {
   agentRunTools,
+  assembleTaskPrompt,
   buildRegistry,
   parseAgentPrompt,
   resolveTask,
   type AgentPrompt,
   type AgentRegistry,
+  type OrchestratorPromptContext,
 } from '../agent-prompt-loader';
 import { QueueStore } from '../queue';
 
@@ -203,3 +205,25 @@ describe('resolveTask', () => {
     );
   });
 });
+
+describe('assembleTaskPrompt', () => {
+  const ctx: OrchestratorPromptContext = {
+    projectId: 1,
+    projectApiKey: 'phc_x',
+    host: 'https://us.posthog.com',
+  };
+
+  it('points the agent at its installed task instructions', () => {
+    const assembled = assembleTaskPrompt(ctx, 'do the task', [
+      '.posthog-wizard/skills/capture/SKILL.md',
+    ]);
+    expect(assembled).toContain('.posthog-wizard/skills/capture/SKILL.md');
+    expect(assembled).toContain('do the task');
+  });
+
+  it('omits the instructions section when no skills are installed', () => {
+    expect(assembleTaskPrompt(ctx, 'do the task')).not.toContain(
+      'task instructions',
+    );
+  });
+});
diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts
index ee351db8..902adaee 100644
--- a/src/lib/programs/orchestrator/agent-prompt-loader.ts
+++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts
@@ -49,6 +49,17 @@ function exampleReference(ctx: OrchestratorPromptContext): string | null {
   return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`;
 }
 
+/**
+ * Points the agent at its installed task instructions (the HOW). They live under
+ * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load
+ * them — the prompt has to name them.
+ */
+function skillReference(paths: readonly string[]): string | null {
+  if (paths.length === 0) return null;
+  const list = paths.map((p) => `\`${p}\``).join(', ');
+  return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`;
+}
+
 /** The framework's rules ship with the reference skill; every task follows them. */
 function commandmentsReference(ctx: OrchestratorPromptContext): string | null {
   if (!ctx.commandmentsPath) return null;
@@ -63,11 +74,13 @@ const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue
 export function assembleTaskPrompt(
   ctx: OrchestratorPromptContext,
   body: string,
+  skillPaths: readonly string[] = [],
 ): string {
   return [
     projectContext(ctx),
     exampleReference(ctx),
     commandmentsReference(ctx),
+    skillReference(skillPaths),
     TASK_BASICS,
     body,
   ]
diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts
index 978a8f31..8f2aac90 100644
--- a/src/lib/programs/orchestrator/orchestrator-runner.ts
+++ b/src/lib/programs/orchestrator/orchestrator-runner.ts
@@ -11,7 +11,7 @@
  * stays product-ignorant: it is the queue, the executor, and the loader.
  */
 import { randomUUID } from 'crypto';
-import { existsSync } from 'fs';
+import { existsSync, rmSync } from 'fs';
 import * as path from 'path';
 import {
   initializeAgent,
@@ -210,18 +210,27 @@ export async function runOrchestrator(
   // parallel, the seed's graph being the only schedule. Each task resolves to
   // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then
   // runs on its own model and tools.
+  const taskSkillsRoot = path.join(QUEUE_DIR_NAME, 'skills');
   const runTask: RunTask = async (task) => {
     renderQueue();
     try {
       const resolved = resolveTask(registry, task, store);
       const agent = await initializeAgent(agentConfigFor(task.id), options);
+      // Task instructions are one-run scaffolding, not durable skills, so they
+      // install under the run dir rather than .claude/skills — the SDK must not
+      // auto-load them and they must never land in the project (or a CI PR).
+      // The prompt points the agent at them instead.
+      const skillPaths: string[] = [];
       for (const skillId of resolved.skills) {
         const result = await installSkillById(
           skillId,
           session.installDir,
           boot.skillsBaseUrl,
+          taskSkillsRoot,
         );
-        if (result.kind !== 'ok') {
+        if (result.kind === 'ok') {
+          skillPaths.push(path.join(result.path, 'SKILL.md'));
+        } else {
           logToFile(
             `[orchestrator] skill install failed type=${task.type} skill=${skillId} ${result.kind}`,
           );
@@ -234,7 +243,7 @@ export async function runOrchestrator(
           allowedTools: resolved.allowedTools,
           disallowedTools: resolved.disallowedTools,
         },
-        assembleTaskPrompt(promptContext, resolved.prompt),
+        assembleTaskPrompt(promptContext, resolved.prompt, skillPaths),
         options,
         spinner,
         // Empty messages suppress the per-task spinner lines (the spinner renders
@@ -252,7 +261,15 @@ export async function runOrchestrator(
       renderQueue();
     }
   };
-  await drainQueue(store, runTask);
+  try {
+    await drainQueue(store, runTask);
+  } finally {
+    // Success or failure, the installed task instructions never outlive the run.
+    rmSync(path.join(session.installDir, taskSkillsRoot), {
+      recursive: true,
+      force: true,
+    });
+  }
 
   renderQueue();
 

From 80d6250b3a653374e94948481e7b708b36318fbe Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:23:03 -0400
Subject: [PATCH 09/12] =?UTF-8?q?feat(orchestrator):=20run=20telemetry=20?=
 =?UTF-8?q?=E2=80=94=20the=20responsiveness=20A/B=20spine=20(#638)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/lib/agent/agent-interface.ts              | 25 +++++
 src/lib/agent/agent-runner.ts                 |  3 +
 .../__tests__/agent-prompt-loader.test.ts     | 18 ++++
 .../orchestrator/__tests__/queue.test.ts      | 40 ++++++++
 .../orchestrator/agent-prompt-loader.ts       | 27 ++---
 .../orchestrator/orchestrator-runner.ts       | 99 ++++++++++++++++++-
 src/lib/programs/orchestrator/queue.ts        | 49 ++++++++-
 7 files changed, 245 insertions(+), 16 deletions(-)

diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts
index 07342385..f80e7857 100644
--- a/src/lib/agent/agent-interface.ts
+++ b/src/lib/agent/agent-interface.ts
@@ -678,6 +678,11 @@ export async function runAgent(
     abortCases?: readonly AbortCaseMatcher[];
     /** Request the end-of-run reflection remark. Defaults to true. */
     requestRemark?: boolean;
+    /**
+     * Extra properties attached to this run's `agent completed` / `agent
+     * aborted` events (e.g. the orchestrator's task type and id).
+     */
+    analyticsProperties?: Record<string, unknown>;
   },
   middleware?: {
     onMessage(message: any): void;
@@ -756,9 +761,27 @@ export async function runAgent(
       analytics.capture(WIZARD_REMARK_EVENT_NAME, { remark });
     }
 
+    // Token usage comes from the SDK result message and is per agent run —
+    // for the orchestrator that means per task, the secondary cost to watch.
+    const usage = lastResultMessage?.usage as
+      | {
+          input_tokens?: number;
+          output_tokens?: number;
+          cache_creation_input_tokens?: number;
+          cache_read_input_tokens?: number;
+        }
+      | undefined;
     analytics.wizardCapture('agent completed', {
       duration_ms: durationMs,
       duration_seconds: durationSeconds,
+      model: agentConfig.model,
+      num_turns: lastResultMessage?.num_turns,
+      total_cost_usd: lastResultMessage?.total_cost_usd,
+      input_tokens: usage?.input_tokens,
+      output_tokens: usage?.output_tokens,
+      cache_creation_input_tokens: usage?.cache_creation_input_tokens,
+      cache_read_input_tokens: usage?.cache_read_input_tokens,
+      ...config?.analyticsProperties,
     });
     try {
       middleware?.finalize(lastResultMessage, durationMs);
@@ -1172,6 +1195,8 @@ export async function runAgent(
       analytics.wizardCapture('agent aborted', {
         duration_ms: durationMs,
         duration_seconds: Math.round(durationMs / 1000),
+        model: agentConfig.model,
+        ...config?.analyticsProperties,
       });
     }
   }
diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index a08320ed..2d693058 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -368,6 +368,9 @@ async function bootstrapProgram(
   // fork decision reads the flags.
   const wizardFlags = await analytics.getAllFlagsForWizard();
   const wizardMetadata = buildWizardMetadata(wizardFlags);
+  // Tag every wizard event with the variant so runs segment in PostHog; the
+  // orchestrator arm overwrites this with its own variant when it forks.
+  analytics.setTag('variant', wizardMetadata.VARIANT);
 
   const mcpUrl = session.localMcp
     ? 'http://localhost:8787/mcp'
diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
index 8252e791..22ce11af 100644
--- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
+++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
@@ -7,6 +7,7 @@ import {
   buildRegistry,
   parseAgentPrompt,
   resolveTask,
+  taskModel,
   type AgentPrompt,
   type AgentRegistry,
   type OrchestratorPromptContext,
@@ -206,6 +207,23 @@ describe('resolveTask', () => {
   });
 });
 
+describe('taskModel', () => {
+  const prompt = parseAgentPrompt(
+    '---\nmodel: prompt-model\n---\nx',
+    'capture',
+  );
+
+  it('prefers the enqueue override, then the prompt, then the default', () => {
+    const registry = registryOf([prompt]);
+    const task = { type: 'capture' };
+    expect(taskModel(registry, { ...task, model: 'override' } as never)).toBe(
+      'override',
+    );
+    expect(taskModel(registry, task as never)).toBe('prompt-model');
+    expect(taskModel(registryOf([]), task as never)).toBe('claude-sonnet-4-6');
+  });
+});
+
 describe('assembleTaskPrompt', () => {
   const ctx: OrchestratorPromptContext = {
     projectId: 1,
diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts
index 4a18dee2..7f34f283 100644
--- a/src/lib/programs/orchestrator/__tests__/queue.test.ts
+++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts
@@ -7,6 +7,10 @@ import {
   type TaskHandoff,
 } from '@lib/programs/orchestrator/queue';
 
+jest.mock('@utils/analytics', () => ({
+  analytics: { captureException: jest.fn(), wizardCapture: jest.fn() },
+}));
+
 function tmpDir(): string {
   return fs.mkdtempSync(path.join(os.tmpdir(), 'queue-test-'));
 }
@@ -132,4 +136,40 @@ describe('QueueStore', () => {
     expect(file.tasks[0].status).toBe('done');
     expect(file.tasks[0].handoff?.did).toBe('d');
   });
+
+  it('notifies the transition listener with post-transition task state', () => {
+    const seen: Array<{ event: string; status: string; attempts: number }> = [];
+    const listened = new QueueStore(dir, 'run-2', {
+      onTransition: (event, task) =>
+        seen.push({ event, status: task.status, attempts: task.attempts }),
+    });
+
+    const t = listened.enqueue({ type: 'install' });
+    listened.start(t.id);
+    listened.fail(t.id, { type: 'API_ERROR', message: 'boom' });
+    listened.requeue(t.id);
+    listened.start(t.id);
+    listened.complete(t.id);
+
+    expect(seen).toEqual([
+      { event: 'enqueue', status: 'pending', attempts: 0 },
+      { event: 'start', status: 'running', attempts: 1 },
+      { event: 'fail', status: 'failed', attempts: 1 },
+      { event: 'requeue', status: 'pending', attempts: 1 },
+      { event: 'start', status: 'running', attempts: 2 },
+      { event: 'complete', status: 'done', attempts: 2 },
+    ]);
+  });
+
+  it('a throwing listener does not break transitions', () => {
+    const listened = new QueueStore(dir, 'run-3', {
+      onTransition: () => {
+        throw new Error('listener boom');
+      },
+    });
+    const t = listened.enqueue({ type: 'install' });
+    listened.start(t.id);
+    listened.complete(t.id);
+    expect(listened.get(t.id)?.status).toBe('done');
+  });
 });
diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts
index 902adaee..3212a2c3 100644
--- a/src/lib/programs/orchestrator/agent-prompt-loader.ts
+++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts
@@ -49,6 +49,16 @@ function exampleReference(ctx: OrchestratorPromptContext): string | null {
   return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`;
 }
 
+/** The framework's rules ship with the reference skill; every task follows them. */
+function commandmentsReference(ctx: OrchestratorPromptContext): string | null {
+  if (!ctx.commandmentsPath) return null;
+  return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`;
+}
+
+const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`;
+
+const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`;
+
 /**
  * Points the agent at its installed task instructions (the HOW). They live under
  * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load
@@ -60,16 +70,6 @@ function skillReference(paths: readonly string[]): string | null {
   return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`;
 }
 
-/** The framework's rules ship with the reference skill; every task follows them. */
-function commandmentsReference(ctx: OrchestratorPromptContext): string | null {
-  if (!ctx.commandmentsPath) return null;
-  return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`;
-}
-
-const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`;
-
-const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`;
-
 /** A task agent's full prompt: injected basics, then the authored intent. */
 export function assembleTaskPrompt(
   ctx: OrchestratorPromptContext,
@@ -315,9 +315,14 @@ export function resolveTask(
     .join('\n\n');
 
   return {
-    model: task.model ?? prompt.model ?? DEFAULT_TASK_MODEL,
+    model: taskModel(registry, task),
     ...agentRunTools(prompt),
     prompt: body,
     skills: prompt.skills,
   };
 }
+
+/** The model a task runs on: enqueue override, then prompt frontmatter, then default. */
+export function taskModel(registry: AgentRegistry, task: QueuedTask): string {
+  return task.model ?? registry.get(task.type)?.model ?? DEFAULT_TASK_MODEL;
+}
diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts
index 8f2aac90..b6129016 100644
--- a/src/lib/programs/orchestrator/orchestrator-runner.ts
+++ b/src/lib/programs/orchestrator/orchestrator-runner.ts
@@ -27,7 +27,12 @@ import { logToFile } from '../../../utils/debug';
 import type { ProgramConfig } from '../program-step';
 import type { BootstrapResult } from '../../agent/agent-runner';
 import type { WizardRunOptions } from '../../../utils/types';
-import { QueueStore, QUEUE_DIR_NAME, TaskStatus } from './queue';
+import {
+  QueueStore,
+  QUEUE_DIR_NAME,
+  TaskStatus,
+  type QueuedTask,
+} from './queue';
 import { drainQueue, type RunTask } from './executor';
 import {
   agentRunTools,
@@ -35,6 +40,7 @@ import {
   assembleTaskPrompt,
   loadAgentRegistry,
   resolveTask,
+  taskModel,
   type OrchestratorPromptContext,
 } from './agent-prompt-loader';
 
@@ -73,7 +79,6 @@ export async function runOrchestrator(
   boot: BootstrapResult,
 ): Promise<void> {
   const runId = randomUUID();
-  const store = new QueueStore(session.installDir, runId);
 
   const options = sessionRunOptions(session);
 
@@ -91,6 +96,74 @@ export async function runOrchestrator(
     );
   }
 
+  // Every wizard event from here on carries the variant, so orchestrator runs
+  // segment cleanly from the linear baseline.
+  analytics.setTag('variant', 'orchestrator');
+
+  // Responsiveness is the headline metric of the dark launch: time to first
+  // visible progress, and no single step dominating wall-clock. Track it from
+  // queue transitions, with the resolved model so cheap work is attributable
+  // to cheap models.
+  const runStartMs = Date.now();
+  let firstStartMs: number | undefined;
+  let lastStartMs: number | undefined;
+  const durationMs = (t: QueuedTask) =>
+    t.startedAt && t.finishedAt
+      ? Date.parse(t.finishedAt) - Date.parse(t.startedAt)
+      : undefined;
+
+  const store = new QueueStore(session.installDir, runId, {
+    onTransition: (event, task) => {
+      const base = {
+        type: task.type,
+        model: taskModel(registry, task),
+        attempts: task.attempts,
+      };
+      switch (event) {
+        case 'enqueue':
+          analytics.wizardCapture('orchestrator task enqueued', {
+            type: task.type,
+            enqueued_by: task.enqueuedBy,
+            dynamic: task.enqueuedBy !== 'orchestrator',
+          });
+          break;
+        case 'start': {
+          const now = Date.now();
+          analytics.wizardCapture('orchestrator task started', {
+            ...base,
+            ms_since_run_start: now - runStartMs,
+            gap_since_prev_start_ms:
+              lastStartMs === undefined ? undefined : now - lastStartMs,
+          });
+          firstStartMs ??= now;
+          lastStartMs = now;
+          break;
+        }
+        case 'complete':
+          analytics.wizardCapture('orchestrator task completed', {
+            ...base,
+            duration_ms: durationMs(task),
+          });
+          break;
+        case 'skip':
+          analytics.wizardCapture('orchestrator task skipped', {
+            ...base,
+            duration_ms: durationMs(task),
+          });
+          break;
+        case 'fail':
+          analytics.wizardCapture('orchestrator task failed', {
+            ...base,
+            duration_ms: durationMs(task),
+            error: task.error?.type,
+          });
+          break;
+        case 'requeue':
+          break;
+      }
+    },
+  });
+
   // Give task agents the framework's finished reference integration to match,
   // the same EXAMPLE.md the linear flow uses. Install it under the run dir rather
   // than .claude/skills so its "do everything" workflow is not auto-loaded as a
@@ -191,6 +264,7 @@ export async function runOrchestrator(
       successMessage: 'Planned the integration',
       additionalFeatureQueue: [],
       requestRemark: false,
+      analyticsProperties: { task_type: 'seed' },
     },
   );
   if (seedResult.error) {
@@ -211,6 +285,7 @@ export async function runOrchestrator(
   // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then
   // runs on its own model and tools.
   const taskSkillsRoot = path.join(QUEUE_DIR_NAME, 'skills');
+  let remarkRequested = false;
   const runTask: RunTask = async (task) => {
     renderQueue();
     try {
@@ -236,6 +311,18 @@ export async function runOrchestrator(
           );
         }
       }
+      // The run-end reflection fires once, on the task that is last in the
+      // queue when it starts — nothing else pending or running alongside it.
+      const isLastTask = !store
+        .list()
+        .some(
+          (t) =>
+            t.id !== task.id &&
+            (t.status === TaskStatus.Pending ||
+              t.status === TaskStatus.Running),
+        );
+      const requestRemark = isLastTask && !remarkRequested;
+      if (requestRemark) remarkRequested = true;
       await runAgent(
         {
           ...agent,
@@ -249,12 +336,12 @@ export async function runOrchestrator(
         // Empty messages suppress the per-task spinner lines (the spinner renders
         // only when a message is set); the queue panel shows progress. Errors
         // still surface — runAgent stops the spinner with its own error text.
-        // No per-task remark — the reflection would fire on every task.
         {
           spinnerMessage: '',
           successMessage: '',
           additionalFeatureQueue: [],
-          requestRemark: false,
+          requestRemark,
+          analyticsProperties: { task_type: task.type, task_id: task.id },
         },
       );
     } finally {
@@ -281,6 +368,10 @@ export async function runOrchestrator(
     tasks_total: summary.total,
     tasks_done: summary.done,
     tasks_failed: summary.failed,
+    tasks_skipped: summary.skipped,
+    total_duration_ms: Date.now() - runStartMs,
+    time_to_first_task_ms:
+      firstStartMs === undefined ? undefined : firstStartMs - runStartMs,
   });
 
   // The build step flags any unresolved conflict in its handoff; surface the
diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts
index 4ecc3cb5..302897e6 100644
--- a/src/lib/programs/orchestrator/queue.ts
+++ b/src/lib/programs/orchestrator/queue.ts
@@ -14,6 +14,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import { randomUUID } from 'crypto';
 import { writeJsonAtomic } from '../../../utils/atomic-ledger';
+import { analytics } from '../../../utils/analytics';
 
 export const TaskStatus = {
   Pending: 'pending',
@@ -75,17 +76,40 @@ export interface EnqueueInput {
 export const QUEUE_DIR_NAME = '.posthog-wizard';
 const DEFAULT_MAX_ATTEMPTS = 2;
 
+/** Every queue transition, in the order it is reflected. */
+export type TransitionEvent =
+  | 'enqueue'
+  | 'start'
+  | 'complete'
+  | 'skip'
+  | 'fail'
+  | 'requeue';
+
+export interface QueueStoreOptions {
+  /**
+   * Called on every transition with the task's post-transition state. The
+   * runner uses it for telemetry; the store emits no events of its own and
+   * only reports listener errors, which cannot break a transition.
+   */
+  onTransition?: (event: TransitionEvent, task: QueuedTask) => void;
+}
+
 function nowIso(): string {
   return new Date().toISOString();
 }
 
 export class QueueStore {
   private tasks: QueuedTask[] = [];
+  private readonly onTransition?: (
+    event: TransitionEvent,
+    task: QueuedTask,
+  ) => void;
 
   readonly runId: string;
   readonly queuePath: string;
 
-  constructor(installDir: string, runId: string) {
+  constructor(installDir: string, runId: string, opts?: QueueStoreOptions) {
+    this.onTransition = opts?.onTransition;
     this.runId = runId;
     const dir = path.join(installDir, QUEUE_DIR_NAME);
     this.queuePath = path.join(dir, 'queue.json');
@@ -172,6 +196,7 @@ export class QueueStore {
     };
     this.tasks.push(task);
     this.reflect();
+    this.notify('enqueue', task);
     return task;
   }
 
@@ -181,6 +206,7 @@ export class QueueStore {
     t.startedAt = nowIso();
     t.attempts += 1;
     this.reflect();
+    this.notify('start', t);
     return t;
   }
 
@@ -210,6 +236,7 @@ export class QueueStore {
     t.startedAt = undefined;
     t.finishedAt = undefined;
     this.reflect();
+    this.notify('requeue', t);
     return t;
   }
 
@@ -225,6 +252,14 @@ export class QueueStore {
     t.status = status;
     t.finishedAt = nowIso();
     this.reflect();
+    this.notify(
+      status === TaskStatus.Done
+        ? 'complete'
+        : status === TaskStatus.Skipped
+        ? 'skip'
+        : 'fail',
+      t,
+    );
     return t;
   }
 
@@ -237,6 +272,18 @@ export class QueueStore {
     writeJsonAtomic(this.queuePath, file);
   }
 
+  private notify(event: TransitionEvent, task: QueuedTask): void {
+    try {
+      this.onTransition?.(event, task);
+    } catch (error) {
+      // A listener must never break a transition, but its failure is a bug.
+      analytics.captureException(
+        error instanceof Error ? error : new Error(String(error)),
+        { step: 'orchestrator_queue_listener', event },
+      );
+    }
+  }
+
   private require(id: string): QueuedTask {
     const t = this.get(id);
     if (!t) throw new Error(`No task ${id} in the queue`);

From 1d8ef519a8ebef70a93063b8ae76750d8af890a0 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:23:18 -0400
Subject: [PATCH 10/12] feat(orchestrator): ci-excluded task types (#639)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/smoke-test.sh                         | 26 +++++++------
 src/env.ts                                    |  1 +
 .../__tests__/agent-prompt-loader.test.ts     | 14 ++++++-
 .../agent-prompt-loader.ts                    | 16 ++++++--
 src/lib/agent/agent-runner.ts                 |  8 ++--
 src/lib/agent/mcp-prompt-streaming.ts         | 39 +++----------------
 .../__tests__/queue-tools.test.ts             | 12 ++++++
 src/lib/programs/orchestrator/executor.ts     |  2 +-
 .../orchestrator/orchestrator-runner.ts       |  4 +-
 src/lib/programs/orchestrator/queue-tools.ts  | 20 +++++++++-
 src/lib/programs/orchestrator/queue.ts        | 18 ++++++---
 src/utils/__tests__/ci-flag-overrides.test.ts | 36 ++++++++++++++++-
 src/utils/ci-flag-overrides.ts                | 35 +++++++++++++----
 13 files changed, 160 insertions(+), 71 deletions(-)
 rename src/lib/{programs/orchestrator => agent}/__tests__/agent-prompt-loader.test.ts (93%)
 rename src/lib/{programs/orchestrator => agent}/agent-prompt-loader.ts (95%)

diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh
index 1e9a690b..fcdab7f3 100755
--- a/scripts/smoke-test.sh
+++ b/scripts/smoke-test.sh
@@ -24,14 +24,16 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH
 # builds and tsdown strips it; its env var name appearing in dist/*.js means
 # dead-code elimination regressed and a prod surface leaked. Sourcemaps keep
 # the original source, so only .js output counts.
-OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES'
+OVERRIDE_MARKERS='WIZARD_CI_FLAG_OVERRIDES WIZARD_CI_EXCLUDE_TASKS'
 if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
-  # CI builds must keep the path — its absence means the override silently
-  # stopped working and CI is back to testing live flags.
-  if ! grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
-    echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2
-    exit 1
-  fi
+  # CI builds must keep the paths — their absence means the overrides silently
+  # stopped working and CI is back to testing live behavior.
+  for marker in $OVERRIDE_MARKERS; do
+    if ! grep -q "$marker" ./dist/*.js; then
+      echo "Smoke test failed: CI build is missing the $marker path" >&2
+      exit 1
+    fi
+  done
   # And a real invocation must accept the env var. yargs claims every
   # POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects
   # unknown ones during command parse (--version/--help short-circuit and
@@ -44,10 +46,12 @@ if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
     exit 1
   fi
 else
-  if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
-    echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2
-    exit 1
-  fi
+  for marker in $OVERRIDE_MARKERS; do
+    if grep -q "$marker" ./dist/*.js; then
+      echo "Smoke test failed: $marker code leaked into a production build" >&2
+      exit 1
+    fi
+  done
 fi
 
 # ── 3. --ci rejected in production builds ────────────────────────────────────
diff --git a/src/env.ts b/src/env.ts
index c32e886a..4b727441 100644
--- a/src/env.ts
+++ b/src/env.ts
@@ -43,6 +43,7 @@ type RuntimeEnvKey =
   // Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD')
   // would claim it as an unknown CLI option and strict-reject the run.
   | 'WIZARD_CI_FLAG_OVERRIDES'
+  | 'WIZARD_CI_EXCLUDE_TASKS'
   // Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix)
   | 'POSTHOG_WIZARD_BENCHMARK_CONFIG'
   | 'POSTHOG_WIZARD_BENCHMARK_FILE'
diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts
similarity index 93%
rename from src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
rename to src/lib/agent/__tests__/agent-prompt-loader.test.ts
index 22ce11af..a0a2b04a 100644
--- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts
+++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts
@@ -12,7 +12,7 @@ import {
   type AgentRegistry,
   type OrchestratorPromptContext,
 } from '../agent-prompt-loader';
-import { QueueStore } from '../queue';
+import { QueueStore } from '../../programs/orchestrator/queue';
 
 function tmpDir(): string {
   return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-'));
@@ -131,6 +131,18 @@ describe('buildRegistry', () => {
     // A flowless prompt (e.g. the documentation example) joins no registry.
     expect(registry.get('example')).toBeUndefined();
   });
+
+  it('drops harness-excluded types; unrestricted runs keep them', () => {
+    const prompts = [
+      prompt({ type: 'plan', flow: 'f', seed: true }),
+      prompt({ type: 'build', flow: 'f' }),
+      prompt({ type: 'dashboard', flow: 'f' }),
+    ];
+    expect(
+      buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types,
+    ).toEqual(['build']);
+    expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']);
+  });
 });
 
 describe('resolveTask', () => {
diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts
similarity index 95%
rename from src/lib/programs/orchestrator/agent-prompt-loader.ts
rename to src/lib/agent/agent-prompt-loader.ts
index 3212a2c3..1fe487b1 100644
--- a/src/lib/programs/orchestrator/agent-prompt-loader.ts
+++ b/src/lib/agent/agent-prompt-loader.ts
@@ -15,8 +15,8 @@
  * network latency. The registry's type list also drives `enqueue_task`
  * validation.
  */
-import type { QueueStore, QueuedTask } from './queue';
-import type { ResolvedTask } from './executor';
+import type { QueueStore, QueuedTask } from '../programs/orchestrator/queue';
+import type { ResolvedTask } from '../programs/orchestrator/executor';
 
 /**
  * The basics the client injects around every agent-prompt body. The `/agents/`
@@ -137,8 +137,15 @@ export interface AgentRegistry {
 export function buildRegistry(
   prompts: readonly AgentPrompt[],
   flow: string,
+  opts?: { exclude?: readonly string[] },
 ): AgentRegistry {
-  const inFlow = prompts.filter((p) => p.flow === flow);
+  // The harness can exclude task types (CI excludes dashboards). An excluded
+  // type does not exist for the run: the seed cannot enqueue it and no agent
+  // is ever spun up for it.
+  const excluded = new Set(opts?.exclude ?? []);
+  const inFlow = prompts.filter(
+    (p) => p.flow === flow && !excluded.has(p.type),
+  );
   const byType = new Map(inFlow.map((p) => [p.type, p]));
   return {
     types: inFlow.filter((p) => !p.seed).map((p) => p.type),
@@ -238,6 +245,7 @@ async function fetchText(url: string): Promise<string> {
 export async function loadAgentRegistry(
   skillsBaseUrl: string,
   flow: string,
+  opts?: { exclude?: readonly string[] },
 ): Promise<AgentRegistry> {
   const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`);
   const menu = JSON.parse(menuRaw) as AgentMenu;
@@ -249,7 +257,7 @@ export async function loadAgentRegistry(
     }),
   );
 
-  return buildRegistry(prompts, flow);
+  return buildRegistry(prompts, flow, opts);
 }
 
 /**
diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index 2d693058..0f3269c4 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -372,12 +372,12 @@ async function bootstrapProgram(
   // orchestrator arm overwrites this with its own variant when it forks.
   analytics.setTag('variant', wizardMetadata.VARIANT);
 
+  // One MCP url for every region: the server resolves the user's region from
+  // the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is
+  // not needed here.
   const mcpUrl = session.localMcp
     ? 'http://localhost:8787/mcp'
-    : runtimeEnv('MCP_URL') ||
-      (cloudRegion === 'eu'
-        ? 'https://mcp-eu.posthog.com/mcp'
-        : 'https://mcp.posthog.com/mcp');
+    : runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp';
 
   return {
     skillsBaseUrl,
diff --git a/src/lib/agent/mcp-prompt-streaming.ts b/src/lib/agent/mcp-prompt-streaming.ts
index dc8f8ff9..b3655f12 100644
--- a/src/lib/agent/mcp-prompt-streaming.ts
+++ b/src/lib/agent/mcp-prompt-streaming.ts
@@ -42,38 +42,11 @@ const MODEL = 'claude-sonnet-4-6';
 // telemetry on average turn counts per prompt.
 const MAX_TURNS = 30;
 
-function resolveMcpUrl(host: string): string {
-  const override = runtimeEnv('MCP_URL');
-  if (override) return override;
-  // Parse the actual hostname rather than substring-matching the raw
-  // input. `host.includes('eu.posthog.com')` would let arbitrary URLs
-  // like `https://evil.eu.posthog.com.attacker.com` or
-  // `https://useu.posthog.commerce` route to the EU MCP endpoint
-  // (CodeQL: incomplete-url-substring-sanitization). Parsing into a
-  // hostname and checking exact match / trusted subdomain blocks both.
-  const hostname = parseHostname(host);
-  const isEu =
-    hostname === 'eu.posthog.com' || hostname.endsWith('.eu.posthog.com');
-  return isEu
-    ? 'https://mcp-eu.posthog.com/mcp'
-    : 'https://mcp.posthog.com/mcp';
-}
-
-/**
- * Normalize a host string into a hostname suitable for trust checks.
- * Accepts either a full URL (`https://us.posthog.com`) or a bare host
- * (`us.posthog.com`). Returns the hostname lowercased, or the trimmed
- * input lowercased if parsing fails (defensive fallback so a malformed
- * value still resolves to the safer-default US endpoint).
- */
-function parseHostname(raw: string): string {
-  const trimmed = raw.trim().toLowerCase();
-  try {
-    const withScheme = trimmed.includes('://') ? trimmed : `https://${trimmed}`;
-    return new URL(withScheme).hostname.toLowerCase();
-  } catch {
-    return trimmed;
-  }
+// One MCP url for every region: the server resolves the user's region from
+// the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is
+// not needed here.
+function resolveMcpUrl(): string {
+  return runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp';
 }
 
 /**
@@ -245,7 +218,7 @@ export async function* runMcpPromptViaSdk(args: {
       once: true,
     });
 
-  const mcpUrl = resolveMcpUrl(credentials.host);
+  const mcpUrl = resolveMcpUrl();
   logToFile(
     `[runMcpPromptViaSdk] mcpUrl=${mcpUrl} model=${MODEL} resume=${
       resumeSessionId ?? '(none)'
diff --git a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
index 318825d2..33def856 100644
--- a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
+++ b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
@@ -57,6 +57,18 @@ describe('checkEnqueueGuards', () => {
     const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' });
     expect(r).toEqual({ ok: true });
   });
+
+  it('refuses to grow the queue past the runaway cap', () => {
+    for (let i = 0; i < 30; i++) {
+      store.enqueue({ type: 'capture', inputs: { i } });
+    }
+    const r = checkEnqueueGuards(ctx, {
+      type: 'init',
+      inputs: { i: 30 },
+      reason: 'x',
+    });
+    expect(r).toMatchObject({ ok: false, guard: 'queue-full' });
+  });
 });
 
 describe('apply functions', () => {
diff --git a/src/lib/programs/orchestrator/executor.ts b/src/lib/programs/orchestrator/executor.ts
index abf0ed15..d8cfb976 100644
--- a/src/lib/programs/orchestrator/executor.ts
+++ b/src/lib/programs/orchestrator/executor.ts
@@ -101,7 +101,7 @@ export async function drainQueue(
   for (;;) {
     for (const task of store.nextRunnable()) {
       if (++starts > opts.maxStarts) break;
-      // runOne marks the task in_progress synchronously, so the next
+      // runOne marks the task running synchronously, so the next
       // nextRunnable() call no longer offers it.
       const p = runOne(store, runTask, task).finally(() =>
         running.delete(task.id),
diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts
index b6129016..31df7372 100644
--- a/src/lib/programs/orchestrator/orchestrator-runner.ts
+++ b/src/lib/programs/orchestrator/orchestrator-runner.ts
@@ -23,6 +23,7 @@ import { detectNodePackageManagers } from '../../detection/package-manager';
 import { installSkillById } from '../../wizard-tools';
 import { getUI } from '../../../ui';
 import { analytics } from '../../../utils/analytics';
+import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides';
 import { logToFile } from '../../../utils/debug';
 import type { ProgramConfig } from '../program-step';
 import type { BootstrapResult } from '../../agent/agent-runner';
@@ -42,7 +43,7 @@ import {
   resolveTask,
   taskModel,
   type OrchestratorPromptContext,
-} from './agent-prompt-loader';
+} from '../../agent/agent-prompt-loader';
 
 function toTodoStatus(status: TaskStatus): string {
   switch (status) {
@@ -88,6 +89,7 @@ export async function runOrchestrator(
   const registry = await loadAgentRegistry(
     boot.skillsBaseUrl,
     programConfig.id,
+    { exclude: ciExcludedTaskTypes() },
   );
   const seedPrompt = registry.seed;
   if (!seedPrompt) {
diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts
index 64e5bc93..5a05cda0 100644
--- a/src/lib/programs/orchestrator/queue-tools.ts
+++ b/src/lib/programs/orchestrator/queue-tools.ts
@@ -55,10 +55,18 @@ function dedupKey(type: string, inputs: Record<string, unknown>): string {
   return `${type}::${stableStringify(inputs)}`;
 }
 
+/**
+ * A backstop on total queue size. Tasks can enqueue tasks, so a misbehaving
+ * type could grow the queue without bound. Keeping the graph small is the job
+ * of good agent and skill design, not this number — it only stops a runaway.
+ * The real flow is ~9 tasks, so this sits well clear of it.
+ */
+const MAX_QUEUE_TASKS = 30;
+
 /**
  * Validate an enqueue. Structural checks only — a real type, real dependencies,
- * and not a literal duplicate. How much runs, and in what shape, is the task
- * graph's business, not a knob's.
+ * not a literal duplicate, and not past the runaway backstop. How much runs,
+ * and in what shape, is the task graph's business, not a knob's.
  */
 export function checkEnqueueGuards(
   ctx: OrchestratorToolsContext,
@@ -66,6 +74,14 @@ export function checkEnqueueGuards(
 ): GuardResult {
   const tasks = ctx.store.list();
 
+  if (tasks.length >= MAX_QUEUE_TASKS) {
+    return {
+      ok: false,
+      guard: 'queue-full',
+      message: `The queue already holds ${tasks.length} tasks (cap ${MAX_QUEUE_TASKS}). Refine the existing tasks rather than adding more.`,
+    };
+  }
+
   if (!ctx.validTypes.includes(args.type)) {
     return {
       ok: false,
diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts
index 302897e6..19545d9d 100644
--- a/src/lib/programs/orchestrator/queue.ts
+++ b/src/lib/programs/orchestrator/queue.ts
@@ -32,6 +32,12 @@ export interface QueuedTask {
   /** Human-readable label for the TUI, set by the enqueuing agent. */
   label?: string;
   status: TaskStatus;
+  /**
+   * Ids of tasks that must finish before this one runs. Ids are generated at
+   * enqueue and dependsOn is never mutated, so a task can only depend on tasks
+   * created before it — the graph is a DAG by construction, cycles cannot
+   * form. Unknown ids are rejected by the enqueue_task guard.
+   */
   dependsOn: string[];
   inputs: Record<string, unknown>;
   model?: string;
@@ -76,6 +82,10 @@ export interface EnqueueInput {
 export const QUEUE_DIR_NAME = '.posthog-wizard';
 const DEFAULT_MAX_ATTEMPTS = 2;
 
+function nowIso(): string {
+  return new Date().toISOString();
+}
+
 /** Every queue transition, in the order it is reflected. */
 export type TransitionEvent =
   | 'enqueue'
@@ -94,10 +104,6 @@ export interface QueueStoreOptions {
   onTransition?: (event: TransitionEvent, task: QueuedTask) => void;
 }
 
-function nowIso(): string {
-  return new Date().toISOString();
-}
-
 export class QueueStore {
   private tasks: QueuedTask[] = [];
   private readonly onTransition?: (
@@ -147,7 +153,7 @@ export class QueueStore {
   }
 
   /**
-   * True when no task is in progress and none can be started. Either everything
+   * True when no task is running and none can be started. Either everything
    * is terminal, or the only pending tasks are blocked by a failed dependency.
    */
   isDrained(): boolean {
@@ -229,7 +235,7 @@ export class QueueStore {
     return this.finish(id, TaskStatus.Failed, handoff);
   }
 
-  /** Put a failed/in-progress task back to pending for a retry within the run. */
+  /** Put a failed/running task back to pending for a retry within the run. */
   requeue(id: string): QueuedTask {
     const t = this.require(id);
     t.status = TaskStatus.Pending;
diff --git a/src/utils/__tests__/ci-flag-overrides.test.ts b/src/utils/__tests__/ci-flag-overrides.test.ts
index 4d2333a1..4f0d844f 100644
--- a/src/utils/__tests__/ci-flag-overrides.test.ts
+++ b/src/utils/__tests__/ci-flag-overrides.test.ts
@@ -1,4 +1,7 @@
-import { applyCiFlagOverrides } from '@utils/ci-flag-overrides';
+import {
+  applyCiFlagOverrides,
+  ciExcludedTaskTypes,
+} from '@utils/ci-flag-overrides';
 
 jest.mock('@utils/debug', () => ({
   logToFile: jest.fn(),
@@ -61,3 +64,34 @@ describe('applyCiFlagOverrides', () => {
     });
   });
 });
+
+describe('ciExcludedTaskTypes', () => {
+  afterEach(() => {
+    delete process.env.WIZARD_CI_EXCLUDE_TASKS;
+  });
+
+  it('is empty when nothing is excluded', () => {
+    expect(ciExcludedTaskTypes()).toEqual([]);
+  });
+
+  it('parses the comma-separated list, ignoring stray whitespace', () => {
+    process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard, report ,';
+    expect(ciExcludedTaskTypes()).toEqual(['dashboard', 'report']);
+  });
+
+  it('is inert in production builds', () => {
+    const prevNodeEnv = process.env.NODE_ENV;
+    process.env.NODE_ENV = 'production';
+    process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard';
+    let result: readonly string[] | undefined;
+    jest.isolateModules(() => {
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      const prod = require('@utils/ci-flag-overrides') as {
+        ciExcludedTaskTypes: typeof ciExcludedTaskTypes;
+      };
+      result = prod.ciExcludedTaskTypes();
+    });
+    process.env.NODE_ENV = prevNodeEnv;
+    expect(result).toEqual([]);
+  });
+});
diff --git a/src/utils/ci-flag-overrides.ts b/src/utils/ci-flag-overrides.ts
index e8790e23..475060c3 100644
--- a/src/utils/ci-flag-overrides.ts
+++ b/src/utils/ci-flag-overrides.ts
@@ -3,14 +3,15 @@
  *
  * CI must route deterministically: a run that tests the orchestrator arm says
  * so explicitly instead of depending on a live feature flag someone can edit
- * mid-week. `WIZARD_CI_FLAG_OVERRIDES` is a JSON object of flag key →
- * value, merged over whatever PostHog returned.
+ * mid-week. The override env var (see the allowlist in `env.ts`) is a JSON
+ * object of flag key → value, merged over whatever PostHog returned.
  *
  * The override path exists only in CI builds (`pnpm build:ci`). Published
- * builds inline NODE_ENV as the literal "production", the guard below
- * collapses, and tsdown strips the rest from the bundle — and the smoke test
- * asserts the env var's name is physically absent from production output, so
- * this can never quietly become a production surface.
+ * builds inline NODE_ENV as the literal "production", the guards collapse,
+ * and tsdown strips the rest from the bundle — and the smoke test asserts the
+ * env var names are physically absent from production output (which is also
+ * why no comment in this file may spell them out), so this can never quietly
+ * become a production surface.
  */
 import { runtimeEnv } from '@env';
 import { logToFile } from './debug';
@@ -33,7 +34,7 @@ export function applyCiFlagOverrides(
     // A malformed override is a CI misconfiguration. Fail the run loudly
     // rather than silently testing whatever the live flags happen to say.
     throw new Error(
-      'WIZARD_CI_FLAG_OVERRIDES is not valid JSON (expected {"flag-key": value, ...}).',
+      'The CI flag-override env var is not valid JSON (expected {"flag-key": value, ...}).',
     );
   }
 
@@ -44,3 +45,23 @@ export function applyCiFlagOverrides(
   logToFile('[flags] CI overrides applied', overrides);
   return merged;
 }
+
+/**
+ * Task types excluded from this run. The exclusion env var (see the allowlist
+ * in `env.ts`) is a comma-separated list (e.g. `dashboard,report`), set by the CI
+ * harness that owns the policy — the wizard and the served content stay
+ * run-mode agnostic. CI-build only, same as the flag overrides: published
+ * builds strip this path.
+ */
+export function ciExcludedTaskTypes(): readonly string[] {
+  if (process.env.NODE_ENV === 'production') return [];
+
+  const raw = runtimeEnv('WIZARD_CI_EXCLUDE_TASKS');
+  if (!raw) return [];
+  const types = raw
+    .split(',')
+    .map((t) => t.trim())
+    .filter(Boolean);
+  if (types.length > 0) logToFile('[flags] CI task exclusions', types);
+  return types;
+}

From d7077e161d3b40415d9caf67f2915987e219b15a Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:23:28 -0400
Subject: [PATCH 11/12] feat(orchestrator): self-deleting run cache +
 responsiveness analytics (#677)

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
---
 .../__tests__/agent-prompt-loader.test.ts     | 40 +++++++++
 src/lib/agent/agent-prompt-loader.ts          | 35 ++++++--
 .../orchestrator/__tests__/queue.test.ts      | 10 +++
 .../__tests__/run-metrics.test.ts             | 68 +++++++++++++++
 .../orchestrator/orchestrator-runner.ts       | 81 +++++++++++++-----
 src/lib/programs/orchestrator/queue.ts        | 19 ++++-
 src/lib/programs/orchestrator/run-metrics.ts  | 85 +++++++++++++++++++
 src/ui/tui/primitives/LogViewer.tsx           |  6 +-
 8 files changed, 311 insertions(+), 33 deletions(-)
 create mode 100644 src/lib/programs/orchestrator/__tests__/run-metrics.test.ts
 create mode 100644 src/lib/programs/orchestrator/run-metrics.ts

diff --git a/src/lib/agent/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts
index a0a2b04a..d2ad5b7a 100644
--- a/src/lib/agent/__tests__/agent-prompt-loader.test.ts
+++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts
@@ -217,6 +217,46 @@ describe('resolveTask', () => {
       'Context from previous steps',
     );
   });
+
+  it('includes transitive ancestors, not just direct dependencies', () => {
+    const registry = registryOf([prompt]);
+    // install -> capture -> (this task). The task depends only on capture, but
+    // install's context must still reach it so nothing is silently lost.
+    const install = store.enqueue({ type: 'install' });
+    store.complete(install.id, {
+      goals: 'declare the SDK',
+      did: 'added posthog to the manifest',
+      forNextAgent: 'SDK is declared, not yet installed',
+    });
+    const capture = store.enqueue({ type: 'capture', dependsOn: [install.id] });
+    store.complete(capture.id, {
+      goals: 'instrument events',
+      did: 'added capture calls',
+      forNextAgent: 'events are in',
+    });
+    const task = store.enqueue({ type: 'capture', dependsOn: [capture.id] });
+    const { prompt: out } = resolveTask(registry, task, store);
+    expect(out).toContain('added posthog to the manifest'); // transitive
+    expect(out).toContain('added capture calls'); // direct
+  });
+
+  it('lists each ancestor once for diamond dependencies', () => {
+    const registry = registryOf([prompt]);
+    const install = store.enqueue({ type: 'install' });
+    store.complete(install.id, {
+      goals: 'g',
+      did: 'manifest entry added',
+      forNextAgent: 'n',
+    });
+    const a = store.enqueue({ type: 'identify', dependsOn: [install.id] });
+    store.complete(a.id, { goals: 'g', did: 'a-did', forNextAgent: 'n' });
+    const b = store.enqueue({ type: 'identify', dependsOn: [install.id] });
+    store.complete(b.id, { goals: 'g', did: 'b-did', forNextAgent: 'n' });
+    // Resolved task must be a registered type (capture); its ancestors need not be.
+    const task = store.enqueue({ type: 'capture', dependsOn: [a.id, b.id] });
+    const { prompt: out } = resolveTask(registry, task, store);
+    expect(out.match(/manifest entry added/g)).toHaveLength(1);
+  });
 });
 
 describe('taskModel', () => {
diff --git a/src/lib/agent/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts
index 1fe487b1..8b83ff9a 100644
--- a/src/lib/agent/agent-prompt-loader.ts
+++ b/src/lib/agent/agent-prompt-loader.ts
@@ -55,7 +55,7 @@ function commandmentsReference(ctx: OrchestratorPromptContext): string | null {
   return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`;
 }
 
-const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`;
+const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only inside this project's own directory: never read, list, or search (find, ls, grep, glob) outside it — not the OS, not other projects, not global package caches. If your task seems to need something outside this directory, it does not — skip that part and say so in your handoff rather than hunting across the filesystem. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`;
 
 const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`;
 
@@ -277,14 +277,37 @@ function formatInputValue(value: unknown): string {
 }
 
 /**
- * Render the handoffs of a task's completed dependencies into a context section,
- * so a fresh agent sees what the upstream steps did. Empty when there are none.
+ * The ids of every task `task` transitively depends on — the full upstream
+ * chain, not just direct dependencies — ordered roots-first, each once. A `seen`
+ * set dedupes diamonds and guards against cycles.
+ */
+function ancestorIds(task: QueuedTask, store: QueueStore): string[] {
+  const seen = new Set<string>();
+  const ordered: string[] = [];
+  const visit = (id: string): void => {
+    if (seen.has(id)) return;
+    seen.add(id);
+    const t = store.get(id);
+    if (!t) return;
+    for (const dep of t.dependsOn) visit(dep); // ancestors before dependents
+    ordered.push(id);
+  };
+  for (const dep of task.dependsOn) visit(dep);
+  return ordered;
+}
+
+/**
+ * Render the handoffs of every step `task` transitively depends on into a context
+ * section, so a fresh agent sees the whole upstream chain — not just its direct
+ * dependencies. Reliability over token economy: a step must never have to
+ * re-discover what any ancestor already established just because an intermediate
+ * handoff happened to omit it. Empty when there are no completed ancestors.
  */
 function renderHandoffContext(task: QueuedTask, store: QueueStore): string {
   const lines: string[] = [];
-  for (const depId of task.dependsOn) {
-    const dep = store.get(depId);
-    const handoff = store.readHandoff(depId);
+  for (const id of ancestorIds(task, store)) {
+    const dep = store.get(id);
+    const handoff = store.readHandoff(id);
     if (!dep || !handoff) continue;
     lines.push(`### ${dep.type}`);
     lines.push(`- did: ${handoff.did}`);
diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts
index 7f34f283..3b493d0f 100644
--- a/src/lib/programs/orchestrator/__tests__/queue.test.ts
+++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts
@@ -3,6 +3,7 @@ import * as os from 'os';
 import * as path from 'path';
 import {
   QueueStore,
+  QUEUE_DIR_NAME,
   type QueueFile,
   type TaskHandoff,
 } from '@lib/programs/orchestrator/queue';
@@ -28,6 +29,15 @@ describe('QueueStore', () => {
     fs.rmSync(dir, { recursive: true, force: true });
   });
 
+  it('drops a self-explaining .DELETE-ME.md in the cache folder', () => {
+    const note = fs.readFileSync(
+      path.join(dir, QUEUE_DIR_NAME, '.DELETE-ME.md'),
+      'utf8',
+    );
+    expect(note).toContain('safely delete');
+    expect(note).toContain(`${QUEUE_DIR_NAME}/`);
+  });
+
   it('enqueues a pending task with defaults', () => {
     const t = q.enqueue({ type: 'install' });
     expect(t.status).toBe('pending');
diff --git a/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts
new file mode 100644
index 00000000..544f17b2
--- /dev/null
+++ b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts
@@ -0,0 +1,68 @@
+import { RunMetrics } from '@lib/programs/orchestrator/run-metrics';
+
+describe('RunMetrics', () => {
+  it('reports time to first start and first completion from run start', () => {
+    const m = new RunMetrics(0);
+    m.recordStart(100);
+    m.recordComplete(300);
+    m.recordStart(1000);
+    m.recordComplete(1100);
+    const s = m.summary();
+    expect(s.time_to_first_task_ms).toBe(100);
+    expect(s.time_to_first_completion_ms).toBe(300);
+  });
+
+  it('max_gap_ms is the longest silence across all visible transitions', () => {
+    const m = new RunMetrics(0);
+    m.recordStart(100); // visible @100
+    m.recordComplete(300); // gap 200
+    m.recordStart(1000); // gap 700  ← longest
+    m.recordComplete(1100); // gap 100
+    expect(m.summary().max_gap_ms).toBe(700);
+  });
+
+  it('recordStart returns ms_since_run_start and the gap from the previous start', () => {
+    const m = new RunMetrics(0);
+    expect(m.recordStart(100)).toEqual({
+      ms_since_run_start: 100,
+      gap_since_prev_start_ms: undefined,
+    });
+    expect(m.recordStart(1000)).toEqual({
+      ms_since_run_start: 1000,
+      gap_since_prev_start_ms: 900,
+    });
+  });
+
+  it('reports undefined timings for a run with no transitions, not zero', () => {
+    const s = new RunMetrics(0).summary();
+    expect(s.time_to_first_task_ms).toBeUndefined();
+    expect(s.time_to_first_completion_ms).toBeUndefined();
+    expect(s.max_gap_ms).toBeUndefined();
+  });
+
+  it('a single started-but-unfinished task reports a real zero gap and no completion', () => {
+    const m = new RunMetrics(0);
+    m.recordStart(50);
+    const s = m.summary();
+    expect(s.time_to_first_task_ms).toBe(50);
+    expect(s.time_to_first_completion_ms).toBeUndefined();
+    expect(s.max_gap_ms).toBe(0); // one visible transition → genuine 0, not undefined
+  });
+
+  it('counts a retry stall (start to re-start) as silence', () => {
+    const m = new RunMetrics(0);
+    m.recordStart(0);
+    // the task ended without reporting and was requeued (invisible), then
+    // re-started 5s later — that stall is a silence the user sees.
+    m.recordStart(5000);
+    expect(m.summary().max_gap_ms).toBe(5000);
+  });
+
+  it('treats skip and fail as visible transitions for gap tracking', () => {
+    const m = new RunMetrics(0);
+    m.recordStart(0);
+    m.recordTerminal(2000); // skip or fail, gap 2000
+    m.recordStart(2500); // gap 500
+    expect(m.summary().max_gap_ms).toBe(2000);
+  });
+});
diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts
index 31df7372..6353d8c8 100644
--- a/src/lib/programs/orchestrator/orchestrator-runner.ts
+++ b/src/lib/programs/orchestrator/orchestrator-runner.ts
@@ -20,7 +20,7 @@ import {
 } from '../../agent/agent-interface';
 import { OutroKind, type WizardSession } from '../../wizard-session';
 import { detectNodePackageManagers } from '../../detection/package-manager';
-import { installSkillById } from '../../wizard-tools';
+import { installSkillById, fetchSkillMenu } from '../../wizard-tools';
 import { getUI } from '../../../ui';
 import { analytics } from '../../../utils/analytics';
 import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides';
@@ -35,6 +35,7 @@ import {
   type QueuedTask,
 } from './queue';
 import { drainQueue, type RunTask } from './executor';
+import { RunMetrics } from './run-metrics';
 import {
   agentRunTools,
   assembleSeedPrompt,
@@ -74,6 +75,27 @@ function sessionRunOptions(session: WizardSession): WizardRunOptions {
   };
 }
 
+/**
+ * The framework reference is the full `integration` skill. `session.skillId` is
+ * the bare framework (e.g. `django`), but the skill menu lists it as
+ * `integration-<variant>`. Resolve to the menu id: exact `integration-<framework>`
+ * (the 1:1 frameworks — django, python, flask, …), else the first granular variant
+ * under it (e.g. `integration-nextjs-app-router`). Undefined when none exists.
+ */
+async function resolveReferenceSkillId(
+  skillsBaseUrl: string,
+  framework: string,
+): Promise<string | undefined> {
+  const menu = await fetchSkillMenu(skillsBaseUrl);
+  if (!menu) return undefined;
+  const ids = Object.values(menu.categories)
+    .flat()
+    .map((s) => s.id);
+  const exact = `integration-${framework}`;
+  if (ids.includes(exact)) return exact;
+  return ids.find((id) => id.startsWith(`integration-${framework}-`));
+}
+
 export async function runOrchestrator(
   session: WizardSession,
   programConfig: ProgramConfig,
@@ -107,8 +129,7 @@ export async function runOrchestrator(
   // queue transitions, with the resolved model so cheap work is attributable
   // to cheap models.
   const runStartMs = Date.now();
-  let firstStartMs: number | undefined;
-  let lastStartMs: number | undefined;
+  const metrics = new RunMetrics(runStartMs);
   const durationMs = (t: QueuedTask) =>
     t.startedAt && t.finishedAt
       ? Date.parse(t.finishedAt) - Date.parse(t.startedAt)
@@ -129,31 +150,28 @@ export async function runOrchestrator(
             dynamic: task.enqueuedBy !== 'orchestrator',
           });
           break;
-        case 'start': {
-          const now = Date.now();
+        case 'start':
           analytics.wizardCapture('orchestrator task started', {
             ...base,
-            ms_since_run_start: now - runStartMs,
-            gap_since_prev_start_ms:
-              lastStartMs === undefined ? undefined : now - lastStartMs,
+            ...metrics.recordStart(Date.now()),
           });
-          firstStartMs ??= now;
-          lastStartMs = now;
           break;
-        }
         case 'complete':
+          metrics.recordComplete(Date.now());
           analytics.wizardCapture('orchestrator task completed', {
             ...base,
             duration_ms: durationMs(task),
           });
           break;
         case 'skip':
+          metrics.recordTerminal(Date.now());
           analytics.wizardCapture('orchestrator task skipped', {
             ...base,
             duration_ms: durationMs(task),
           });
           break;
         case 'fail':
+          metrics.recordTerminal(Date.now());
           analytics.wizardCapture('orchestrator task failed', {
             ...base,
             duration_ms: durationMs(task),
@@ -172,9 +190,12 @@ export async function runOrchestrator(
   // skill — only the example file is read, when the agent's prompt points at it.
   let examplePath: string | undefined;
   let commandmentsPath: string | undefined;
-  if (session.skillId) {
+  const referenceSkillId = session.skillId
+    ? await resolveReferenceSkillId(boot.skillsBaseUrl, session.skillId)
+    : undefined;
+  if (referenceSkillId) {
     const ref = await installSkillById(
-      session.skillId,
+      referenceSkillId,
       session.installDir,
       boot.skillsBaseUrl,
       path.join(QUEUE_DIR_NAME, 'reference'),
@@ -189,8 +210,14 @@ export async function runOrchestrator(
         commandmentsPath = commandments;
       }
     } else {
-      logToFile(`[orchestrator] reference example unavailable: ${ref.kind}`);
+      logToFile(
+        `[orchestrator] reference unavailable: ${ref.kind} (${referenceSkillId})`,
+      );
     }
+  } else if (session.skillId) {
+    logToFile(
+      `[orchestrator] no integration skill for framework "${session.skillId}"`,
+    );
   }
 
   // The client injects the basics (project context + the I/O contract) around
@@ -353,11 +380,20 @@ export async function runOrchestrator(
   try {
     await drainQueue(store, runTask);
   } finally {
-    // Success or failure, the installed task instructions never outlive the run.
-    rmSync(path.join(session.installDir, taskSkillsRoot), {
-      recursive: true,
-      force: true,
-    });
+    // Success or failure, no run artifact outlives the run — wipe the whole
+    // cache folder (queue, handoffs, reference example, installed task
+    // instructions). The .DELETE-ME.md inside is the fallback if we don't.
+    try {
+      rmSync(path.join(session.installDir, QUEUE_DIR_NAME), {
+        recursive: true,
+        force: true,
+      });
+    } catch (err) {
+      analytics.captureException(
+        err instanceof Error ? err : new Error(String(err)),
+        { step: 'orchestrator_cache_cleanup' },
+      );
+    }
   }
 
   renderQueue();
@@ -372,8 +408,11 @@ export async function runOrchestrator(
     tasks_failed: summary.failed,
     tasks_skipped: summary.skipped,
     total_duration_ms: Date.now() - runStartMs,
-    time_to_first_task_ms:
-      firstStartMs === undefined ? undefined : firstStartMs - runStartMs,
+    ...metrics.summary(),
+    dynamic_enqueue_count: store
+      .list()
+      .filter((t) => t.enqueuedBy !== 'orchestrator').length,
+    retried_task_count: store.list().filter((t) => t.attempts > 1).length,
   });
 
   // The build step flags any unresolved conflict in its handoff; surface the
diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts
index 19545d9d..4aa7c368 100644
--- a/src/lib/programs/orchestrator/queue.ts
+++ b/src/lib/programs/orchestrator/queue.ts
@@ -6,9 +6,10 @@
  * returns every pending task whose dependencies are satisfied, and how many of
  * those run at once is decided by the task graph, not the queue.
  *
- * Every transition rewrites `<installDir>/.posthog-wizard/queue.json`, a small
- * file holding the whole queue, handoffs included. Today it is the run's
- * log and the report's source; later it is the resume point.
+ * Every transition rewrites `<installDir>/.posthog-wizard-cache/queue.json`, a
+ * small file holding the whole queue, handoffs included. It is the run's log
+ * and the report's source. The whole cache folder is run-scoped and wiped when
+ * the run ends.
  */
 import * as fs from 'fs';
 import * as path from 'path';
@@ -79,13 +80,22 @@ export interface EnqueueInput {
   enqueuedBy?: string;
 }
 
-export const QUEUE_DIR_NAME = '.posthog-wizard';
+export const QUEUE_DIR_NAME = '.posthog-wizard-cache';
 const DEFAULT_MAX_ATTEMPTS = 2;
 
 function nowIso(): string {
   return new Date().toISOString();
 }
 
+/** Dropped in the cache folder so an orphaned copy explains itself. */
+const DELETE_ME_FILE = '.DELETE-ME.md';
+const DELETE_ME_BODY = `# Safe to delete
+
+This folder contains run artifacts from the PostHog Wizard. This should have
+been deleted if the Wizard has finished running. If this wasn't deleted for
+some reason, you can safely delete the entire \`${QUEUE_DIR_NAME}/\` folder.
+`;
+
 /** Every queue transition, in the order it is reflected. */
 export type TransitionEvent =
   | 'enqueue'
@@ -120,6 +130,7 @@ export class QueueStore {
     const dir = path.join(installDir, QUEUE_DIR_NAME);
     this.queuePath = path.join(dir, 'queue.json');
     fs.mkdirSync(dir, { recursive: true });
+    fs.writeFileSync(path.join(dir, DELETE_ME_FILE), DELETE_ME_BODY);
   }
 
   // ── Reads ───────────────────────────────────────────────────────────
diff --git a/src/lib/programs/orchestrator/run-metrics.ts b/src/lib/programs/orchestrator/run-metrics.ts
new file mode 100644
index 00000000..8ea82415
--- /dev/null
+++ b/src/lib/programs/orchestrator/run-metrics.ts
@@ -0,0 +1,85 @@
+/**
+ * Responsiveness metrics for an orchestrator run.
+ *
+ * Responsiveness is the experiment's headline: how quickly the user sees the
+ * first progress, and whether progress stays steady (no long silences). The math
+ * is accumulated from queue transitions but kept here, pure and time-injected, so
+ * it is unit-testable away from the runner. Wall-clock times are passed in as
+ * milliseconds; the caller owns the clock.
+ */
+
+export interface RunMetricsSummary {
+  /** Run start → first task started. */
+  time_to_first_task_ms?: number;
+  /** Run start → first task completed (the first visible "done"). */
+  time_to_first_completion_ms?: number;
+  /** Longest silence between two consecutive user-visible transitions. */
+  max_gap_ms?: number;
+}
+
+/** The per-event timing the `orchestrator task started` event reports. */
+export interface StartTiming {
+  ms_since_run_start: number;
+  gap_since_prev_start_ms?: number;
+}
+
+export class RunMetrics {
+  private firstStartMs?: number;
+  private lastStartMs?: number;
+  private firstCompleteMs?: number;
+  private lastVisibleMs?: number;
+  private maxGapMs = 0;
+
+  constructor(private readonly runStartMs: number) {}
+
+  /** A task started. Returns the timing fields reported on its start event. */
+  recordStart(nowMs: number): StartTiming {
+    const timing: StartTiming = {
+      ms_since_run_start: nowMs - this.runStartMs,
+      gap_since_prev_start_ms:
+        this.lastStartMs === undefined ? undefined : nowMs - this.lastStartMs,
+    };
+    this.firstStartMs ??= nowMs;
+    this.lastStartMs = nowMs;
+    this.markVisible(nowMs);
+    return timing;
+  }
+
+  /** A task completed. */
+  recordComplete(nowMs: number): void {
+    this.firstCompleteMs ??= nowMs;
+    this.markVisible(nowMs);
+  }
+
+  /** A task reached a terminal non-complete state the user sees (skip/fail). */
+  recordTerminal(nowMs: number): void {
+    this.markVisible(nowMs);
+  }
+
+  /**
+   * The run-level responsiveness summary. Timings are `undefined` when the
+   * relevant transition never happened (e.g. a run that started no task), so a
+   * no-task run stays distinguishable from a genuine zero.
+   */
+  summary(): RunMetricsSummary {
+    return {
+      time_to_first_task_ms:
+        this.firstStartMs === undefined
+          ? undefined
+          : this.firstStartMs - this.runStartMs,
+      time_to_first_completion_ms:
+        this.firstCompleteMs === undefined
+          ? undefined
+          : this.firstCompleteMs - this.runStartMs,
+      max_gap_ms: this.lastVisibleMs === undefined ? undefined : this.maxGapMs,
+    };
+  }
+
+  /** requeue is not user-visible, so a retry stall counts as silence here. */
+  private markVisible(nowMs: number): void {
+    if (this.lastVisibleMs !== undefined) {
+      this.maxGapMs = Math.max(this.maxGapMs, nowMs - this.lastVisibleMs);
+    }
+    this.lastVisibleMs = nowMs;
+  }
+}
diff --git a/src/ui/tui/primitives/LogViewer.tsx b/src/ui/tui/primitives/LogViewer.tsx
index 9fa02cd4..8277802c 100644
--- a/src/ui/tui/primitives/LogViewer.tsx
+++ b/src/ui/tui/primitives/LogViewer.tsx
@@ -15,8 +15,10 @@ import { useState, useEffect } from 'react';
 import * as fs from 'fs';
 import { useStdoutDimensions } from '@ui/tui/hooks/useStdoutDimensions';
 
-/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar + tab bar */
-const CHROME_ROWS = 8;
+/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar +
+ *  tab bar, with a couple rows of headroom so the tail never crowds the status
+ *  bar below it. */
+const CHROME_ROWS = 10;
 
 /** Bytes read from the end of the log per refresh — large enough to contain
  *  any practical visible window of lines, small enough to allocate cheaply. */

From b2842fab853ef9151db7fe85402969f944e8781d Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:23:42 -0400
Subject: [PATCH 12/12] feat(agent-runner): don't abort non-interactive runs on
 a health-check outage (#678)

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
---
 src/lib/agent/agent-runner.ts | 18 ++++++++++++------
 src/ui/logging-ui.ts          |  6 ++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts
index 0f3269c4..b202a3d0 100644
--- a/src/lib/agent/agent-runner.ts
+++ b/src/lib/agent/agent-runner.ts
@@ -55,6 +55,7 @@ import {
 } from '@utils/debug';
 import { createBenchmarkPipeline } from '@lib/middleware/benchmark';
 import { wizardAbort, WizardError, registerCleanup } from '@utils/wizard-abort';
+import { isNonInteractiveEnvironment } from '@utils/environment';
 import { formatScanReport, writeScanReport } from '@lib/yara-hooks';
 import { detectNodePackageManagers } from '@lib/detection/package-manager';
 import type { PackageManagerDetector } from '@lib/detection/package-manager';
@@ -279,12 +280,17 @@ async function bootstrapProgram(
 
       await getUI().showBlockingOutage(readiness);
 
-      await wizardAbort({
-        message:
-          'Cannot start — external services are down:\n' +
-          blockingLabels.map((l) => `  - ${l}`).join('\n') +
-          '\n\nPlease try again later.',
-      });
+      // The TUI lets the user continue past an outage; non-interactive runs
+      // (CI) do the same automatically — the degraded services are reported
+      // above, but we proceed rather than aborting on a transient upstream blip.
+      if (!isNonInteractiveEnvironment()) {
+        await wizardAbort({
+          message:
+            'Cannot start — external services are down:\n' +
+            blockingLabels.map((l) => `  - ${l}`).join('\n') +
+            '\n\nPlease try again later.',
+        });
+      }
     } else if (readiness.decision === WizardReadiness.YesWithWarnings) {
       getUI().setReadinessWarnings(readiness);
     }
diff --git a/src/ui/logging-ui.ts b/src/ui/logging-ui.ts
index 9ae0a2ee..40a32523 100644
--- a/src/ui/logging-ui.ts
+++ b/src/ui/logging-ui.ts
@@ -114,7 +114,7 @@ export class LoggingUI implements WizardUI {
   }
 
   showBlockingOutage(result: WizardReadinessResult): Promise<void> {
-    console.log(`▲  Service health issues detected — blocking outage.`);
+    console.log(`▲  Service health issues detected.`);
     const blockingKeys = getBlockingServiceKeys(result.health);
     if (blockingKeys.length > 0) {
       console.log(`│`);
@@ -131,7 +131,9 @@ export class LoggingUI implements WizardUI {
     for (const reason of result.reasons) {
       console.log(`│  ${reason}`);
     }
-    console.log(`│  The wizard cannot start while these services are down.`);
+    console.log(
+      `│  Continuing anyway — health checks are advisory in non-interactive runs.`,
+    );
     return Promise.resolve();
   }