From 1376a78f514f818dc87fa93745d2a2da62749f5c Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Mon, 8 Jun 2026 21:50:30 -0400 Subject: [PATCH 01/12] feat(orchestrator): flag gating + shared bootstrap extraction Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agent/__tests__/variant-gating.test.ts | 36 ++++++ src/lib/agent/agent-interface.ts | 12 ++ src/lib/agent/agent-runner.ts | 111 +++++++++++++++--- src/lib/constants.ts | 2 + 4 files changed, 143 insertions(+), 18 deletions(-) create mode 100644 src/lib/agent/__tests__/variant-gating.test.ts diff --git a/src/lib/agent/__tests__/variant-gating.test.ts b/src/lib/agent/__tests__/variant-gating.test.ts new file mode 100644 index 00000000..699bd096 --- /dev/null +++ b/src/lib/agent/__tests__/variant-gating.test.ts @@ -0,0 +1,36 @@ +import { + buildWizardMetadata, + isOrchestratorEnabled, +} from '@lib/agent/agent-interface'; + +describe('isOrchestratorEnabled', () => { + it('is true only when the wizard-orchestrator flag is true', () => { + expect(isOrchestratorEnabled({ 'wizard-orchestrator': 'true' })).toBe(true); + }); + + it('is false when the flag is false, another flag, or absent', () => { + expect(isOrchestratorEnabled({ 'wizard-orchestrator': 'false' })).toBe( + false, + ); + expect(isOrchestratorEnabled({ 'wizard-variant': 'orchestrator' })).toBe( + false, + ); + expect(isOrchestratorEnabled({})).toBe(false); + expect(isOrchestratorEnabled()).toBe(false); + }); +}); + +describe('buildWizardMetadata', () => { + it('selects a known variant header from the flag', () => { + expect(buildWizardMetadata({ 'wizard-variant': 'subagents' })).toEqual({ + VARIANT: 'subagents', + }); + }); + + it('falls back to the base variant for unknown or missing flags', () => { + expect(buildWizardMetadata({ 'wizard-variant': 'nope' })).toEqual({ + VARIANT: 'base', + }); + expect(buildWizardMetadata({})).toEqual({ VARIANT: 'base' }); + }); +}); diff --git a/src/lib/agent/agent-interface.ts 
b/src/lib/agent/agent-interface.ts index aefb5bc4..46f375a2 100644 --- a/src/lib/agent/agent-interface.ts +++ b/src/lib/agent/agent-interface.ts @@ -15,6 +15,7 @@ import { POSTHOG_PROPERTY_HEADER_PREFIX, WIZARD_VARIANT_FLAG_KEY, WIZARD_VARIANTS, + WIZARD_ORCHESTRATOR_FLAG_KEY, WIZARD_USER_AGENT, } from '@lib/constants'; import { @@ -245,6 +246,17 @@ export function buildWizardMetadata( return { ...variant }; } +/** + * Whether this run uses the experimental task-queue orchestrator. Gated by the + * boolean `wizard-orchestrator` feature flag, targeted to the user in the wizard's + * analytics project. + */ +export function isOrchestratorEnabled( + flags: Record = {}, +): boolean { + return flags[WIZARD_ORCHESTRATOR_FLAG_KEY] === 'true'; +} + /** * Build env for the SDK subprocess: process.env plus ANTHROPIC_CUSTOM_HEADERS, which always * includes `x-posthog-use-bedrock-fallback: true` so the LLM gateway falls back to Bedrock on diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 4af134ff..77c2f9ce 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -9,9 +9,11 @@ * - What MCP servers and package manager detector to use * - What happens after the agent completes * - * The pipeline itself is fixed: - * init → health check → settings → OAuth → [skill install] → - * agent init → prompt → run → errors → [postRun] → outro + * The pipeline runs a shared bootstrap (logging, health check, settings, OAuth, + * flags, MCP url), then forks. A run with the boolean `wizard-orchestrator` flag on routes to the + * experimental task-queue runner.
Every other run takes the fixed linear + * pipeline: + * [skill install] → agent init → prompt → run → errors → [postRun] → outro */ import { @@ -53,7 +55,7 @@ import { getSkillsBaseUrl } from '@lib/constants'; import { runtimeEnv } from '@env'; import { installSkillById, type InstallSkillResult } from '@lib/wizard-tools'; import { createWizardAskBridge } from '@lib/wizard-ask-bridge'; -import type { WizardRunOptions } from '@utils/types'; +import type { WizardRunOptions, CloudRegion } from '@utils/types'; import type { ProgramConfig } from '@lib/programs/program-step'; import { assemblePrompt, type PromptContext } from './agent-prompt'; @@ -108,7 +110,7 @@ export interface ProgramRun { buildOutroData?: ( session: WizardSession, credentials: Credentials, - cloudRegion: import('@utils/types').CloudRegion | undefined, + cloudRegion: CloudRegion | undefined, ) => WizardSession['outroData']; /** * Per-run cap on `wizard_ask` invocations. Defaults to 10. The 4th call @@ -124,6 +126,23 @@ export interface ProgramRun { askTimeoutMs?: number; } +/** + * Result of the shared bootstrap, consumed by both the linear and the + * orchestrator arm. Credentials, role, and user are already applied to the + * session by `bootstrapProgram`; this carries the values both arms still need. + */ +export interface BootstrapResult { + skillsBaseUrl: string; + projectApiKey: Credentials['projectApiKey']; + host: Credentials['host']; + accessToken: Credentials['accessToken']; + projectId: Credentials['projectId']; + cloudRegion: CloudRegion; + mcpUrl: string; + wizardFlags: Record; + wizardMetadata: Record; +} + // ── Helpers ────────────────────────────────────────────────────────── /** @@ -179,16 +198,31 @@ export async function runAgent( /** * Run a program's agent pipeline. * - * This is the single execution path for all programs — both skill-based - * (revenue analytics) and framework-based (core integration).
The - * `ProgramRun` controls what varies between them; `programConfig` carries - * the program-level static metadata (tool allow/disallow lists, etc.). + * Runs the shared bootstrap, then the linear pipeline. The experimental + * task-queue runner is gated by the boolean `wizard-orchestrator` flag (see + * `isOrchestratorEnabled`), not by `wizard-variant`; this revision always takes the linear arm. */ export async function runProgram( session: WizardSession, config: ProgramRun, programConfig: ProgramConfig, ): Promise { + const boot = await bootstrapProgram(session, config, programConfig); + + return runLinearProgram(session, config, programConfig, boot); +} + +/** + * Shared setup for both arms: logging, health check, settings conflicts, OAuth + * and credentials, then the feature flags, variant metadata, and MCP url. Sets + * `session.credentials`, role, and user as a side effect. Returns the values the + * arms still need. + */ +async function bootstrapProgram( + session: WizardSession, + config: ProgramRun, + programConfig: ProgramConfig, +): Promise { // 1. Init logging + debug initLogFile(); session.skillId = config.skillId ?? config.integrationLabel; @@ -310,10 +344,60 @@ export async function runProgram( // install and agent start, so no source leaves the machine. The screen // alone is cosmetic; this await is the actual gate. Resolves // immediately when the program declared requiresAi: false or in CI. + // In bootstrapProgram so both the linear and orchestrator arms gate. logToFile('[agent-runner] checking AI opt-in gate'); await getUI().waitForAiOptIn(); logToFile('[agent-runner] AI opt-in gate cleared'); + // Feature flags, variant metadata, and MCP url. Both arms need these, and the + // fork decision reads the flags. + const wizardFlags = await analytics.getAllFlagsForWizard(); + const wizardMetadata = buildWizardMetadata(wizardFlags); + + const mcpUrl = session.localMcp + ? 'http://localhost:8787/mcp' + : runtimeEnv('MCP_URL') || + (cloudRegion === 'eu' + ?
'https://mcp-eu.posthog.com/mcp' + : 'https://mcp.posthog.com/mcp'); + + return { + skillsBaseUrl, + projectApiKey, + host, + accessToken, + projectId, + cloudRegion, + mcpUrl, + wizardFlags, + wizardMetadata, + }; +} + +/** + * The linear pipeline. Single execution path for all non-orchestrator programs, + * both skill-based (revenue analytics) and framework-based (core integration). + * The `ProgramRun` controls what varies between them; `programConfig` carries the + * program-level static metadata (tool allow/disallow lists, etc.). + */ +async function runLinearProgram( + session: WizardSession, + config: ProgramRun, + programConfig: ProgramConfig, + boot: BootstrapResult, +): Promise { + const { + skillsBaseUrl, + projectApiKey, + host, + accessToken, + projectId, + cloudRegion, + mcpUrl, + wizardFlags, + wizardMetadata, + } = boot; + // 5. Skill install (if skillId provided) let skillPath: string | undefined; if (config.skillId) { @@ -333,15 +417,6 @@ export async function runProgram( // 6. Initialize agent const spinner = getUI().spinner(); - const wizardFlags = await analytics.getAllFlagsForWizard(); - const wizardMetadata = buildWizardMetadata(wizardFlags); - - const mcpUrl = session.localMcp - ? 'http://localhost:8787/mcp' - : runtimeEnv('MCP_URL') || - (cloudRegion === 'eu' - ? 'https://mcp-eu.posthog.com/mcp' - : 'https://mcp.posthog.com/mcp'); const restoreSettings = () => restoreClaudeSettings(session.installDir); getUI().onEnterScreen('outro', restoreSettings); diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 055df06d..09eae21d 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -176,6 +176,8 @@ export const WIZARD_INTERACTION_EVENT_NAME = 'wizard interaction'; export const WIZARD_REMARK_EVENT_NAME = 'wizard remark'; /** Feature flag key whose value selects a variant from WIZARD_VARIANTS. 
*/ export const WIZARD_VARIANT_FLAG_KEY = 'wizard-variant'; +/** Boolean feature flag that routes a run to the experimental orchestrator runner. */ +export const WIZARD_ORCHESTRATOR_FLAG_KEY = 'wizard-orchestrator'; /** Feature flag key that gates the intro-screen "Tools" menu. */ export const WIZARD_TOOLS_MENU_FLAG_KEY = 'wizard-tools-menu'; /** Variant key -> metadata for wizard run (VARIANT flag selects which entry to use). */ From c94739ed4267ff26ebc157814417faec4799b5d1 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:52:05 -0400 Subject: [PATCH 02/12] feat(orchestrator): in-memory queue + disk persistence (QueueStore) (#607) Co-authored-by: Claude Opus 4.8 (1M context) --- .../orchestrator/__tests__/queue.test.ts | 135 ++++++++++ src/lib/programs/orchestrator/queue.ts | 239 ++++++++++++++++++ src/lib/wizard-tools.ts | 23 +- src/utils/atomic-ledger.ts | 29 +++ 4 files changed, 406 insertions(+), 20 deletions(-) create mode 100644 src/lib/programs/orchestrator/__tests__/queue.test.ts create mode 100644 src/lib/programs/orchestrator/queue.ts create mode 100644 src/utils/atomic-ledger.ts diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts new file mode 100644 index 00000000..4a18dee2 --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts @@ -0,0 +1,135 @@ +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { + QueueStore, + type QueueFile, + type TaskHandoff, +} from '@lib/programs/orchestrator/queue'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'queue-test-')); +} + +describe('QueueStore', () => { + let dir: string; + let q: QueueStore; + + beforeEach(() => { + dir = tmpDir(); + q = new QueueStore(dir, 'run-1'); + }); + + afterEach(() => { + fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('enqueues a pending 
task with defaults', () => { + const t = q.enqueue({ type: 'install' }); + expect(t.status).toBe('pending'); + expect(t.attempts).toBe(0); + expect(t.maxAttempts).toBe(2); + expect(t.enqueuedBy).toBe('orchestrator'); + expect(t.dependsOn).toEqual([]); + expect(q.list()).toHaveLength(1); + }); + + it('only marks a task runnable once its dependencies are done', () => { + const a = q.enqueue({ type: 'install' }); + const b = q.enqueue({ type: 'init', dependsOn: [a.id] }); + + expect(q.nextRunnable().map((t) => t.id)).toEqual([a.id]); + + q.start(a.id); + q.complete(a.id); + expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]); + }); + + it('returns every runnable task; the graph alone decides parallelism', () => { + const a = q.enqueue({ type: 'install' }); + const b = q.enqueue({ type: 'init' }); + q.enqueue({ type: 'capture', dependsOn: [a.id, b.id] }); + + // Both independent tasks are runnable at once; the dependent one is not. + expect( + q + .nextRunnable() + .map((t) => t.id) + .sort(), + ).toEqual([a.id, b.id].sort()); + + q.start(a.id); + // An in-progress task is no longer offered. + expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]); + }); + + it('treats a skipped dependency as satisfied', () => { + const a = q.enqueue({ type: 'install' }); + const b = q.enqueue({ type: 'init', dependsOn: [a.id] }); + + q.start(a.id); + q.skip(a.id); + expect(q.nextRunnable().map((t) => t.id)).toEqual([b.id]); + }); + + it('start increments attempts and supports within-run retry while attempts remain', () => { + const t = q.enqueue({ type: 'install', maxAttempts: 2 }); + q.start(t.id); + expect(q.get(t.id)?.attempts).toBe(1); + + q.fail(t.id, { type: 'API_ERROR', message: 'boom' }); + expect(q.get(t.id)?.status).toBe('failed'); + + // Retry: attempts (1) < maxAttempts (2), so requeue and run again. 
+ q.requeue(t.id); + expect(q.get(t.id)?.status).toBe('pending'); + q.start(t.id); + expect(q.get(t.id)?.attempts).toBe(2); + }); + + it('completing a task records and reads back a structured handoff', () => { + const t = q.enqueue({ type: 'install' }); + const handoff: TaskHandoff = { + goals: 'install the sdk', + did: 'added posthog-js', + forNextAgent: 'env vars not set yet', + filesTouched: ['package.json'], + }; + q.start(t.id); + q.complete(t.id, handoff); + + expect(q.get(t.id)?.status).toBe('done'); + expect(q.readHandoff(t.id)).toEqual(handoff); + expect(q.readHandoffsByType('install')).toEqual([handoff]); + }); + + it('is drained when a pending task is blocked by a failed dependency', () => { + const a = q.enqueue({ type: 'install' }); + q.enqueue({ type: 'init', dependsOn: [a.id] }); + + expect(q.isDrained()).toBe(false); + q.start(a.id); + q.fail(a.id, { type: 'API_ERROR', message: 'boom' }); + + // init can never run now, and nothing is in progress. + expect(q.nextRunnable()).toHaveLength(0); + expect(q.isDrained()).toBe(true); + }); + + it('reflects every transition to queue.json, handoffs included', () => { + const a = q.enqueue({ type: 'install' }); + q.start(a.id); + q.complete(a.id, { + goals: 'g', + did: 'd', + forNextAgent: 'n', + }); + + const file = JSON.parse(fs.readFileSync(q.queuePath, 'utf8')) as QueueFile; + expect(file.version).toBe(1); + expect(file.runId).toBe('run-1'); + expect(file.tasks).toHaveLength(1); + expect(file.tasks[0].status).toBe('done'); + expect(file.tasks[0].handoff?.did).toBe('d'); + }); +}); diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts new file mode 100644 index 00000000..5f62c718 --- /dev/null +++ b/src/lib/programs/orchestrator/queue.ts @@ -0,0 +1,239 @@ +/** + * The orchestrator task queue. + * + * In memory, synchronous, single-owner: one Node process drives the run, so + * there is no locking. 
The queue imposes no execution policy — `nextRunnable` + * returns every pending task whose dependencies are satisfied, and how many of + * those run at once is decided by the task graph, not the queue. + * + * Every transition rewrites `<installDir>/.posthog-wizard/queue.json`, a small + * file holding the whole queue, handoffs included. Today it is the run's + * log and the report's source; later it is the resume point. + */ +import * as fs from 'fs'; +import * as path from 'path'; +import { randomUUID } from 'crypto'; +import { writeJsonAtomic } from '../../../utils/atomic-ledger'; + +export const TaskStatus = { + Pending: 'pending', + Running: 'running', + Done: 'done', + Skipped: 'skipped', + Failed: 'failed', +} as const; + +export type TaskStatus = (typeof TaskStatus)[keyof typeof TaskStatus]; + +export interface QueuedTask { + id: string; + type: string; + status: TaskStatus; + dependsOn: string[]; + inputs: Record; + model?: string; + attempts: number; + maxAttempts: number; + /** The structured handoff the task reported on completion. */ + handoff?: TaskHandoff; + /** 'orchestrator' for seeded tasks, or the id of the task that enqueued this one. */ + enqueuedBy: string; + createdAt: string; + startedAt?: string; + finishedAt?: string; + error?: { type: string; message: string }; +} + +export interface QueueFile { + version: 1; + runId: string; + tasks: QueuedTask[]; +} + +/** The structured handoff a task leaves for the next agent.
*/ +export interface TaskHandoff { + goals: string; + did: string; + forNextAgent: string; + filesTouched?: string[]; +} + +export interface EnqueueInput { + type: string; + inputs?: Record; + dependsOn?: string[]; + model?: string; + maxAttempts?: number; + enqueuedBy?: string; +} + +export const QUEUE_DIR_NAME = '.posthog-wizard'; +const DEFAULT_MAX_ATTEMPTS = 2; + +function nowIso(): string { + return new Date().toISOString(); +} + +export class QueueStore { + private tasks: QueuedTask[] = []; + + readonly runId: string; + readonly queuePath: string; + + constructor(installDir: string, runId: string) { + this.runId = runId; + const dir = path.join(installDir, QUEUE_DIR_NAME); + this.queuePath = path.join(dir, 'queue.json'); + fs.mkdirSync(dir, { recursive: true }); + } + + // ── Reads ─────────────────────────────────────────────────────────── + + list(): readonly QueuedTask[] { + return this.tasks; + } + + get(id: string): QueuedTask | undefined { + return this.tasks.find((t) => t.id === id); + } + + /** + * Every pending task whose dependencies are all satisfied (`done` or + * `skipped`). A skipped dependency does not block downstream work. + */ + nextRunnable(): QueuedTask[] { + const doneIds = new Set( + this.tasks + .filter( + (t) => + t.status === TaskStatus.Done || t.status === TaskStatus.Skipped, + ) + .map((t) => t.id), + ); + return this.tasks.filter( + (t) => + t.status === TaskStatus.Pending && + t.dependsOn.every((d) => doneIds.has(d)), + ); + } + + /** + * True when no task is in progress and none can be started. Either everything + * is terminal, or the only pending tasks are blocked by a failed dependency. 
+ */ + isDrained(): boolean { + if (this.tasks.some((t) => t.status === TaskStatus.Running)) return false; + return this.nextRunnable().length === 0; + } + + summary(): Record & { total: number } { + const counts: Record = { + [TaskStatus.Pending]: 0, + [TaskStatus.Running]: 0, + [TaskStatus.Done]: 0, + [TaskStatus.Skipped]: 0, + [TaskStatus.Failed]: 0, + }; + for (const t of this.tasks) counts[t.status] += 1; + return { ...counts, total: this.tasks.length }; + } + + readHandoff(id: string): TaskHandoff | null { + return this.get(id)?.handoff ?? null; + } + + /** Handoffs recorded by tasks of a given type, whatever their status, in enqueue order. */ + readHandoffsByType(type: string): TaskHandoff[] { + return this.tasks + .filter((t) => t.type === type && t.handoff) + .map((t) => t.handoff as TaskHandoff); + } + + // ── Transitions (each one reflected to queue.json) ────────────────── + + enqueue(input: EnqueueInput): QueuedTask { + const task: QueuedTask = { + id: randomUUID(), + type: input.type, + status: TaskStatus.Pending, + dependsOn: input.dependsOn ?? [], + inputs: input.inputs ?? {}, + model: input.model, + attempts: 0, + maxAttempts: input.maxAttempts ?? DEFAULT_MAX_ATTEMPTS, + enqueuedBy: input.enqueuedBy ?? 'orchestrator', + createdAt: nowIso(), + }; + this.tasks.push(task); + this.reflect(); + return task; + } + + start(id: string): QueuedTask { + const t = this.require(id); + t.status = TaskStatus.Running; + t.startedAt = nowIso(); + t.attempts += 1; + this.reflect(); + return t; + } + + complete(id: string, handoff?: TaskHandoff): QueuedTask { + return this.finish(id, TaskStatus.Done, handoff); + } + + /** Terminal: the agent could not do the task. Not done, not failed.
*/ + skip(id: string, handoff?: TaskHandoff): QueuedTask { + return this.finish(id, TaskStatus.Skipped, handoff); + } + + fail( + id: string, + error: { type: string; message: string }, + handoff?: TaskHandoff, + ): QueuedTask { + const t = this.require(id); + t.error = error; + return this.finish(id, TaskStatus.Failed, handoff); + } + + /** Put a failed/in-progress task back to pending for a retry within the run. */ + requeue(id: string): QueuedTask { + const t = this.require(id); + t.status = TaskStatus.Pending; + t.startedAt = undefined; + t.finishedAt = undefined; + this.reflect(); + return t; + } + + // ── Internals ─────────────────────────────────────────────────────── + + private finish( + id: string, + status: 'done' | 'skipped' | 'failed', + handoff?: TaskHandoff, + ): QueuedTask { + const t = this.require(id); + if (handoff) t.handoff = handoff; + t.status = status; + t.finishedAt = nowIso(); + this.reflect(); + return t; + } + + private reflect(): void { + const file: QueueFile = { + version: 1, + runId: this.runId, + tasks: this.tasks, + }; + writeJsonAtomic(this.queuePath, file); + } + + private require(id: string): QueuedTask { + const t = this.get(id); + if (!t) throw new Error(`No task ${id} in the queue`); + return t; + } +} diff --git a/src/lib/wizard-tools.ts b/src/lib/wizard-tools.ts index 7b2f6693..8d2f8d37 100644 --- a/src/lib/wizard-tools.ts +++ b/src/lib/wizard-tools.ts @@ -16,6 +16,7 @@ import { z } from 'zod'; import { logToFile } from '@utils/debug'; import { analytics } from '@utils/analytics'; import { skillTmpPath } from '@utils/paths'; +import { writeJsonAtomic, makeMutex } from '@utils/atomic-ledger'; import type { PackageManagerDetector } from './detection/package-manager'; import { AUDIT_CHECKS_FILE, @@ -389,14 +390,9 @@ const auditUpdateSchema = z.object({ details: z.string().optional(), }); -/** - * Atomically write JSON: write to .tmp then rename. 
The rename is what bumps - * the file's mtime, which is what the UI's file watcher polls on. - */ +/** Atomically write the audit ledger. Thin typed wrapper over writeJsonAtomic. */ function writeLedgerAtomic(targetPath: string, checks: AuditCheck[]): void { - const tmpPath = `${targetPath}.tmp`; - fs.writeFileSync(tmpPath, JSON.stringify(checks, null, 2), 'utf8'); - fs.renameSync(tmpPath, targetPath); + writeJsonAtomic(targetPath, checks); } /** @@ -495,19 +491,6 @@ function appendAuditChecksToLedger( return { ok: true, added: additions.length }; } -/** - * Single async mutex shared by audit tools — guarantees a read-modify-write - * cycle on the ledger is atomic across concurrent tool calls (e.g. future subagents). - */ -function makeMutex() { - let chain: Promise = Promise.resolve(); - return async function run(fn: () => Promise | T): Promise { - const next = chain.then(() => fn()); - chain = next.catch(() => undefined); - return next; - }; -} - // --------------------------------------------------------------------------- // Server factory // --------------------------------------------------------------------------- diff --git a/src/utils/atomic-ledger.ts b/src/utils/atomic-ledger.ts new file mode 100644 index 00000000..0ae8c832 --- /dev/null +++ b/src/utils/atomic-ledger.ts @@ -0,0 +1,29 @@ +/** + * Small shared primitives for on-disk ledgers: an atomic JSON writer and a + * single-chain async mutex. Used by the audit tools and by the orchestrator + * queue. Lifted here so both share one implementation. + */ +import * as fs from 'fs'; + +/** + * Atomically write JSON: write to a `.tmp` file then rename over the target. The + * rename bumps the file's mtime in one step, which is what a file watcher polls. 
+ */ +export function writeJsonAtomic(targetPath: string, data: unknown): void { + const tmpPath = `${targetPath}.tmp`; + fs.writeFileSync(tmpPath, JSON.stringify(data, null, 2), 'utf8'); + fs.renameSync(tmpPath, targetPath); +} + +/** + * A single async mutex. Serializes read-modify-write cycles so concurrent callers + * (parallel task agents, audit tool calls) never interleave a mutation. + */ +export function makeMutex() { + let chain: Promise = Promise.resolve(); + return async function run(fn: () => Promise | T): Promise { + const next = chain.then(() => fn()); + chain = next.catch(() => undefined); + return next; + }; +} From b20d5b911eabe35ec0996e7a7e54d25c391e32e1 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:56:29 -0400 Subject: [PATCH 03/12] feat(orchestrator): enqueue_task, complete_task, read_handoffs tools with guards (#608) Co-authored-by: Claude Opus 4.8 (1M context) --- .../__tests__/queue-tools.test.ts | 127 +++++++++ src/lib/programs/orchestrator/queue-tools.ts | 259 ++++++++++++++++++ src/lib/wizard-tools.ts | 21 ++ 3 files changed, 407 insertions(+) create mode 100644 src/lib/programs/orchestrator/__tests__/queue-tools.test.ts create mode 100644 src/lib/programs/orchestrator/queue-tools.ts diff --git a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts new file mode 100644 index 00000000..318825d2 --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts @@ -0,0 +1,127 @@ +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { QueueStore } from '@lib/programs/orchestrator/queue'; +import { + applyComplete, + applyEnqueue, + applyReadHandoffs, + checkEnqueueGuards, + type OrchestratorToolsContext, +} from '@lib/programs/orchestrator/queue-tools'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 
'queue-tools-test-')); +} + +const VALID = ['install', 'init', 'capture']; + +describe('checkEnqueueGuards', () => { + let dir: string; + let store: QueueStore; + let ctx: OrchestratorToolsContext; + + beforeEach(() => { + dir = tmpDir(); + store = new QueueStore(dir, 'run-1'); + ctx = { store, validTypes: VALID }; + }); + + afterEach(() => fs.rmSync(dir, { recursive: true, force: true })); + + it('rejects an unknown type', () => { + const r = checkEnqueueGuards(ctx, { type: 'nope', reason: 'x' }); + expect(r).toMatchObject({ ok: false, guard: 'unknown-type' }); + }); + + it('rejects an unknown dependency', () => { + const r = checkEnqueueGuards(ctx, { + type: 'init', + dependsOn: ['ghost'], + reason: 'x', + }); + expect(r).toMatchObject({ ok: false, guard: 'unknown-dep' }); + }); + + it('trips dedup on the same type and inputs', () => { + store.enqueue({ type: 'install', inputs: { pkg: 'posthog-js' } }); + const r = checkEnqueueGuards(ctx, { + type: 'install', + inputs: { pkg: 'posthog-js' }, + reason: 'x', + }); + expect(r).toMatchObject({ ok: false, guard: 'dedup' }); + }); + + it('allows a valid enqueue', () => { + const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' }); + expect(r).toEqual({ ok: true }); + }); +}); + +describe('apply functions', () => { + let dir: string; + let store: QueueStore; + let ctx: OrchestratorToolsContext; + + beforeEach(() => { + dir = tmpDir(); + store = new QueueStore(dir, 'run-1'); + ctx = { store, validTypes: VALID }; + }); + + afterEach(() => fs.rmSync(dir, { recursive: true, force: true })); + + it('attributes a seed enqueue to the orchestrator', () => { + const r = applyEnqueue(ctx, { type: 'install', reason: 'seed' }); + expect(r.ok).toBe(true); + if (!r.ok) return; + expect(r.task.enqueuedBy).toBe('orchestrator'); + }); + + it('attributes a follow-up enqueue to the running task', () => { + const parent = store.enqueue({ type: 'init' }); + ctx.currentTaskId = parent.id; + const r = applyEnqueue(ctx, { type: 
'capture', reason: 'follow-up' }); + expect(r.ok).toBe(true); + if (!r.ok) return; + expect(r.task.enqueuedBy).toBe(parent.id); + }); + + it('complete_task fails when no task is running', () => { + const r = applyComplete(ctx, { + status: 'done', + handoff: { goals: 'g', did: 'd', forNextAgent: 'n' }, + }); + expect(r.ok).toBe(false); + }); + + it('complete_task marks the running task done and stores the handoff', () => { + const t = store.enqueue({ type: 'install' }); + ctx.currentTaskId = t.id; + store.start(t.id); + const r = applyComplete(ctx, { + status: 'done', + handoff: { goals: 'g', did: 'added sdk', forNextAgent: 'env next' }, + }); + expect(r.ok).toBe(true); + expect(store.get(t.id)?.status).toBe('done'); + expect(store.readHandoff(t.id)?.did).toBe('added sdk'); + }); + + it('read_handoffs returns a dependency handoff for the running task', () => { + const dep = store.enqueue({ type: 'install' }); + store.start(dep.id); + store.complete(dep.id, { + goals: 'g', + did: 'installed', + forNextAgent: 'now init', + }); + const t = store.enqueue({ type: 'init', dependsOn: [dep.id] }); + ctx.currentTaskId = t.id; + + const handoffs = applyReadHandoffs(ctx, {}); + expect(handoffs).toHaveLength(1); + expect(handoffs[0].did).toBe('installed'); + }); +}); diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts new file mode 100644 index 00000000..6d3aad52 --- /dev/null +++ b/src/lib/programs/orchestrator/queue-tools.ts @@ -0,0 +1,259 @@ +/** + * Orchestrator MCP tools, registered into the existing `wizard-tools` server when + * a queue is present. They let the orchestrator agent and task agents grow the + * queue, report completion with a structured handoff, and read prior handoffs. + * + * The guard logic and the apply functions are plain, exported, and unit-tested. + * `buildOrchestratorTools` wraps them in the SDK `tool()` shape. 
+ */ +import { z } from 'zod'; +import { analytics } from '../../../utils/analytics'; +import { + TaskStatus, + type QueueStore, + type QueuedTask, + type TaskHandoff, +} from './queue'; + +export interface OrchestratorToolsContext { + store: QueueStore; + /** Task types the registry knows about. enqueue_task rejects anything else. */ + validTypes: readonly string[]; + /** + * The id of the task this tool server is bound to. Each task agent gets its + * own wizard-tools server, so attribution holds when independent tasks run + * in parallel. Absent for the seed, which is not a task. + */ + currentTaskId?: string; +} + +export interface EnqueueArgs { + type: string; + inputs?: Record; + dependsOn?: string[]; + model?: string; + reason: string; +} + +export type GuardResult = + | { ok: true } + | { ok: false; guard: string; message: string }; + +function stableStringify(value: unknown): string { + if (value === null || typeof value !== 'object') return JSON.stringify(value); + if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]`; + const entries = Object.entries(value as Record).sort( + ([a], [b]) => a.localeCompare(b), + ); + return `{${entries + .map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`) + .join(',')}}`; +} + +function dedupKey(type: string, inputs: Record): string { + return `${type}::${stableStringify(inputs)}`; +} + +/** + * Validate an enqueue. Structural checks only — a real type, real dependencies, + * and not a literal duplicate. How much runs, and in what shape, is the task + * graph's business, not a knob's. + */ +export function checkEnqueueGuards( + ctx: OrchestratorToolsContext, + args: EnqueueArgs, +): GuardResult { + const tasks = ctx.store.list(); + + if (!ctx.validTypes.includes(args.type)) { + return { + ok: false, + guard: 'unknown-type', + message: `Unknown task type "${ + args.type + }". Valid types: ${ctx.validTypes.join(', ')}.`, + }; + } + + for (const dep of args.dependsOn ?? 
[]) { + if (!ctx.store.get(dep)) { + return { + ok: false, + guard: 'unknown-dep', + message: `Dependency "${dep}" is not a known task id.`, + }; + } + } + + const key = dedupKey(args.type, args.inputs ?? {}); + if ( + tasks.some( + (t) => + t.status !== TaskStatus.Failed && dedupKey(t.type, t.inputs) === key, + ) + ) { + return { + ok: false, + guard: 'dedup', + message: `A "${args.type}" task with these inputs already exists.`, + }; + } + + return { ok: true }; +} + +export type EnqueueResult = + | { ok: true; task: QueuedTask } + | { ok: false; guard: string; message: string }; + +export function applyEnqueue( + ctx: OrchestratorToolsContext, + args: EnqueueArgs, +): EnqueueResult { + const guard = checkEnqueueGuards(ctx, args); + if (!guard.ok) return guard; + + const task = ctx.store.enqueue({ + type: args.type, + inputs: args.inputs ?? {}, + dependsOn: args.dependsOn ?? [], + model: args.model, + enqueuedBy: ctx.currentTaskId ?? 'orchestrator', + }); + return { ok: true, task }; +} + +export type CompleteResult = { ok: true } | { ok: false; message: string }; + +export function applyComplete( + ctx: OrchestratorToolsContext, + args: { status: 'done' | 'failed' | 'skipped'; handoff: TaskHandoff }, +): CompleteResult { + const id = ctx.currentTaskId; + if (!id) { + return { + ok: false, + message: 'complete_task can only be called by a running task agent.', + }; + } + if (args.status === TaskStatus.Failed) { + ctx.store.fail( + id, + { type: 'self-reported', message: args.handoff.forNextAgent }, + args.handoff, + ); + } else if (args.status === TaskStatus.Skipped) { + ctx.store.skip(id, args.handoff); + } else { + ctx.store.complete(id, args.handoff); + } + return { ok: true }; +} + +export function applyReadHandoffs( + ctx: OrchestratorToolsContext, + args: { type?: string; taskId?: string }, +): TaskHandoff[] { + if (args.taskId) { + const h = ctx.store.readHandoff(args.taskId); + return h ? 
[h] : []; + } + if (args.type) { + return ctx.store.readHandoffsByType(args.type); + } + // No filter: every handoff of a dependency of the current task. + const currentId = ctx.currentTaskId; + const current = currentId ? ctx.store.get(currentId) : undefined; + if (!current) return []; + return current.dependsOn + .map((depId) => ctx.store.readHandoff(depId)) + .filter((h): h is TaskHandoff => h !== null); +} + +const HANDOFF_SHAPE = { + goals: z.string().describe('What this task was asked to achieve.'), + did: z.string().describe('What you actually did.'), + forNextAgent: z.string().describe('What the next agent should know.'), + filesTouched: z.array(z.string()).optional(), +}; + +type SdkTool = ( + name: string, + description: string, + // The SDK accepts a plain object of zod fields as the schema. + schema: Record, + handler: (args: never) => unknown, +) => unknown; + +function textResult(text: string, isError = false) { + return { isError, content: [{ type: 'text' as const, text }] }; +} + +/** + * Build the orchestrator tools in the SDK `tool()` shape. Called from + * createWizardToolsServer only when a queue context is present. + */ +export function buildOrchestratorTools( + tool: SdkTool, + ctx: OrchestratorToolsContext, +): unknown[] { + const enqueueTask = tool( + 'enqueue_task', + 'Add a task to the orchestrator queue. Use it to seed work and to enqueue follow-up work you discover. Keep tasks small and discrete.', + { + type: z + .string() + .describe(`The task type. 
One of: ${ctx.validTypes.join(', ')}.`), + inputs: z.record(z.unknown()).optional(), + dependsOn: z + .array(z.string()) + .optional() + .describe('Task ids that must be done before this task runs.'), + model: z.string().optional(), + reason: z.string().describe('One line on why this task is needed.'), + }, + ((args: EnqueueArgs) => { + const res = applyEnqueue(ctx, args); + if (!res.ok) { + analytics.wizardCapture('orchestrator guard tripped', { + guard: res.guard, + type: args.type, + }); + return textResult(res.message, true); + } + return textResult(JSON.stringify({ id: res.task.id })); + }) as (args: never) => unknown, + ); + + const completeTask = tool( + 'complete_task', + "Report the outcome of your task. Always call this exactly once when you finish, with a structured handoff for the next agent. Use status 'skipped' when the task does not apply to this project and you cannot do it (say why in the handoff) — not 'done'.", + { + status: z.enum(['done', 'failed', 'skipped']), + handoff: z.object(HANDOFF_SHAPE), + }, + ((args: { + status: 'done' | 'failed' | 'skipped'; + handoff: TaskHandoff; + }) => { + const res = applyComplete(ctx, args); + if (!res.ok) return textResult(res.message, true); + return textResult('ok'); + }) as (args: never) => unknown, + ); + + const readHandoffs = tool( + 'read_handoffs', + 'Read structured handoffs from earlier tasks. 
With no argument, returns the handoffs of your dependencies.', + { + type: z.string().optional(), + taskId: z.string().optional(), + }, + ((args: { type?: string; taskId?: string }) => { + const handoffs = applyReadHandoffs(ctx, args); + return textResult(JSON.stringify(handoffs, null, 2)); + }) as (args: never) => unknown, + ); + + return [enqueueTask, completeTask, readHandoffs]; +} diff --git a/src/lib/wizard-tools.ts b/src/lib/wizard-tools.ts index 8d2f8d37..0f7e6cc5 100644 --- a/src/lib/wizard-tools.ts +++ b/src/lib/wizard-tools.ts @@ -26,6 +26,10 @@ import { } from './programs/audit/types'; import type { WizardAskBridge } from './wizard-ask-bridge'; import { createSecretVault, type SecretVault } from './secret-vault'; +import { + buildOrchestratorTools, + type OrchestratorToolsContext, +} from './programs/orchestrator/queue-tools'; // --------------------------------------------------------------------------- // SDK dynamic import (ESM module loaded once, cached) @@ -224,6 +228,14 @@ export interface WizardToolsOptions { * (e.g. in unit tests), a fresh vault is created internally. */ secretVault?: SecretVault; + + /** + * Orchestrator queue context. Present only when the `wizard-orchestrator` + * flag routes the run to the orchestrator; when set, the orchestrator tools + * (enqueue_task, complete_task, read_handoffs) are registered. Absent on the + * linear path. + */ + orchestrator?: OrchestratorToolsContext; } /** Default per-run cap on wizard_ask calls when no override is provided. 
*/ @@ -509,6 +521,7 @@ export async function createWizardToolsServer(options: WizardToolsOptions) { askBridge, askMaxQuestions = DEFAULT_ASK_MAX_QUESTIONS, secretVault = createSecretVault(), + orchestrator, } = options; const sdk = await getSDKModule(); const { tool, createSdkMcpServer } = sdk; @@ -1108,6 +1121,10 @@ export async function createWizardToolsServer(options: WizardToolsOptions) { // -- Assemble server ------------------------------------------------------ + const orchestratorTools = orchestrator + ? buildOrchestratorTools(tool, orchestrator) + : []; + return createSdkMcpServer({ name: SERVER_NAME, version: '1.0.0', @@ -1121,6 +1138,7 @@ export async function createWizardToolsServer(options: WizardToolsOptions) { auditAddChecks, auditResolveChecks, wizardAsk, + ...orchestratorTools, ], }); } @@ -1140,6 +1158,9 @@ export const WIZARD_TOOL_NAMES = { auditAddChecks: `mcp__${SERVER_NAME}__audit_add_checks`, auditResolveChecks: `mcp__${SERVER_NAME}__audit_resolve_checks`, wizardAsk: `mcp__${SERVER_NAME}__wizard_ask`, + enqueueTask: `mcp__${SERVER_NAME}__enqueue_task`, + completeTask: `mcp__${SERVER_NAME}__complete_task`, + readHandoffs: `mcp__${SERVER_NAME}__read_handoffs`, } as const; // --------------------------------------------------------------------------- From 56794568597f5669bce898b841819ebb912b2e20 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:57:23 -0400 Subject: [PATCH 04/12] feat(orchestrator): executor drain-loop scheduler (#609) Co-authored-by: Claude Opus 4.8 (1M context) --- .../orchestrator/__tests__/executor.test.ts | 150 ++++++++++++++++++ src/lib/programs/orchestrator/executor.ts | 115 ++++++++++++++ 2 files changed, 265 insertions(+) create mode 100644 src/lib/programs/orchestrator/__tests__/executor.test.ts create mode 100644 src/lib/programs/orchestrator/executor.ts diff --git a/src/lib/programs/orchestrator/__tests__/executor.test.ts 
b/src/lib/programs/orchestrator/__tests__/executor.test.ts new file mode 100644 index 00000000..5665b9b2 --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/executor.test.ts @@ -0,0 +1,150 @@ +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { + QueueStore, + type QueuedTask, + type TaskHandoff, +} from '@lib/programs/orchestrator/queue'; +import { drainQueue, type RunTask } from '@lib/programs/orchestrator/executor'; + +jest.mock('@utils/analytics', () => ({ + analytics: { captureException: jest.fn(), wizardCapture: jest.fn() }, +})); +import { analytics } from '@utils/analytics'; + +const HANDOFF: TaskHandoff = { goals: 'g', did: 'd', forNextAgent: 'n' }; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'executor-test-')); +} + +describe('drainQueue', () => { + let dir: string; + let q: QueueStore; + + beforeEach(() => { + dir = tmpDir(); + q = new QueueStore(dir, 'run-1'); + }); + + afterEach(() => fs.rmSync(dir, { recursive: true, force: true })); + + const completing: RunTask = (task) => { + q.complete(task.id, HANDOFF); + return Promise.resolve(); + }; + + it('runs a single task to done and drains', async () => { + const a = q.enqueue({ type: 'install' }); + await drainQueue(q, completing, { maxStarts: 50 }); + expect(q.get(a.id)?.status).toBe('done'); + expect(q.isDrained()).toBe(true); + }); + + it('runs a dependent task only after its dependency completes', async () => { + const order: string[] = []; + const a = q.enqueue({ type: 'install' }); + const b = q.enqueue({ type: 'init', dependsOn: [a.id] }); + const runner: RunTask = (task) => { + order.push(task.type); + q.complete(task.id, HANDOFF); + return Promise.resolve(); + }; + await drainQueue(q, runner, { maxStarts: 50 }); + expect(order).toEqual(['install', 'init']); + expect(q.get(b.id)?.status).toBe('done'); + }); + + it('runs independent branches concurrently; the graph is the only schedule', async () => { + let active 
= 0; + let maxActive = 0; + const runner: RunTask = async (task) => { + active += 1; + maxActive = Math.max(maxActive, active); + await new Promise((r) => setTimeout(r, 5)); + q.complete(task.id, HANDOFF); + active -= 1; + }; + const a = q.enqueue({ type: 'install' }); + const b = q.enqueue({ type: 'init' }); + q.enqueue({ type: 'capture', dependsOn: [a.id, b.id] }); + await drainQueue(q, runner, { maxStarts: 50 }); + // install and init overlap; capture waits for both. + expect(maxActive).toBe(2); + expect(q.summary().done).toBe(3); + }); + + it('starts a dependent the moment its dependency finishes, not in waves', async () => { + const startedAt: Record = {}; + let clock = 0; + const runner: RunTask = async (task) => { + startedAt[task.type] = clock++; + // slow holds the wave open; fast finishes early and unblocks after-fast. + const delay = task.type === 'slow' ? 30 : 5; + await new Promise((r) => setTimeout(r, delay)); + q.complete(task.id, HANDOFF); + }; + q.enqueue({ type: 'slow' }); + const fast = q.enqueue({ type: 'fast' }); + q.enqueue({ type: 'after-fast', dependsOn: [fast.id] }); + await drainQueue(q, runner, { maxStarts: 50 }); + // after-fast started while slow was still running. 
+ expect(startedAt['after-fast']).toBeDefined(); + expect(q.summary().done).toBe(3); + }); + + it('retries a task that ends without reporting, then fails it', async () => { + const a = q.enqueue({ type: 'install', maxAttempts: 2 }); + const noReport: RunTask = async () => { + /* agent never calls complete_task */ + }; + await drainQueue(q, noReport, { maxStarts: 50 }); + expect(q.get(a.id)?.status).toBe('failed'); + expect(q.get(a.id)?.attempts).toBe(2); + }); + + it('succeeds on a retry within the attempt budget', async () => { + let calls = 0; + const a = q.enqueue({ type: 'install', maxAttempts: 3 }); + const flaky: RunTask = (task: QueuedTask) => { + calls += 1; + if (calls >= 2) q.complete(task.id, HANDOFF); + return Promise.resolve(); + }; + await drainQueue(q, flaky, { maxStarts: 50 }); + expect(q.get(a.id)?.status).toBe('done'); + expect(calls).toBe(2); + }); + + it('captures and fails a task whose runner throws', async () => { + const a = q.enqueue({ type: 'install', maxAttempts: 1 }); + const throwing: RunTask = () => Promise.reject(new Error('agent exploded')); + await drainQueue(q, throwing, { maxStarts: 50 }); + expect(q.get(a.id)?.status).toBe('failed'); + expect(analytics.captureException).toHaveBeenCalled(); + }); + + it('does not run a task whose dependency failed', async () => { + const a = q.enqueue({ type: 'install', maxAttempts: 1 }); + const b = q.enqueue({ type: 'init', dependsOn: [a.id] }); + const runner: RunTask = (task) => { + if (task.type === 'init') q.complete(task.id, HANDOFF); + // install never reports, so it fails after its single attempt. 
+ return Promise.resolve(); + }; + await drainQueue(q, runner, { maxStarts: 50 }); + expect(q.get(a.id)?.status).toBe('failed'); + expect(q.get(b.id)?.status).toBe('pending'); + expect(q.isDrained()).toBe(true); + }); + + it('terminates via the start backstop instead of looping forever', async () => { + const a = q.enqueue({ type: 'install', maxAttempts: 999 }); + const neverReports: RunTask = async () => { + /* would retry forever without the backstop */ + }; + await drainQueue(q, neverReports, { maxStarts: 3 }); + expect(q.get(a.id)?.attempts).toBeLessThanOrEqual(3); + }); +}); diff --git a/src/lib/programs/orchestrator/executor.ts b/src/lib/programs/orchestrator/executor.ts new file mode 100644 index 00000000..abf0ed15 --- /dev/null +++ b/src/lib/programs/orchestrator/executor.ts @@ -0,0 +1,115 @@ +/** + * The executor drains the queue. It starts every runnable task (dependencies + * satisfied) as soon as it becomes runnable — parallelism is decided by the + * task graph, not by an executor knob. Each task runs through an injected + * `runTask` function and reports its outcome via `complete_task`; a task that + * ends without reporting is retried while attempts remain, then failed. A + * `maxStarts` backstop guarantees termination. + * + * The drain loop is independent of how a task actually runs. `runTask` is + * injected: the real one spins up a fresh agent, the tests use a fake. + */ +import { analytics } from '../../../utils/analytics'; +import { logToFile } from '../../../utils/debug'; +import { TaskStatus, type QueueStore, type QueuedTask } from './queue'; + +/** Per-task agent configuration the resolver produces from a task's type. */ +export interface ResolvedTask { + model: string; + allowedTools: readonly string[]; + disallowedTools: readonly string[]; + /** Mini-skills to install before the task runs (the HOW). */ + skills: readonly string[]; + prompt: string; +} + +/** Resolves a queued task to what the agent needs. The real one is markdown-backed. 
*/ +export type TaskResolver = ( + task: QueuedTask, + store: QueueStore, +) => ResolvedTask; + +/** Runs one task's agent. It is expected to drive the task to a terminal state + * (via the task agent calling complete_task). */ +export type RunTask = (task: QueuedTask) => Promise; + +export interface DrainOptions { + /** Backstop against a pathological always-one-more-pending loop. */ + maxStarts: number; +} + +export const DEFAULT_DRAIN_OPTIONS: DrainOptions = { + maxStarts: 200, +}; + +async function runOne( + store: QueueStore, + runTask: RunTask, + task: QueuedTask, +): Promise { + store.start(task.id); + try { + await runTask(task); + } catch (error) { + // The task threw rather than reporting. The outcome check below handles + // the queue; the exception itself should never be silent. + logToFile(`[executor] runTask threw for ${task.type}:`, error); + analytics.captureException( + error instanceof Error ? error : new Error(String(error)), + { step: 'orchestrator_run_task', task_type: task.type }, + ); + } + + const after = store.get(task.id); + if (!after) return; + + if (after.status === TaskStatus.Running) { + // The agent ended without calling complete_task. Retry or fail. + if (after.attempts < after.maxAttempts) { + store.requeue(task.id); + } else { + store.fail(task.id, { + type: 'no-report', + message: 'Task ended without calling complete_task.', + }); + } + return; + } + + if ( + after.status === TaskStatus.Failed && + after.attempts < after.maxAttempts + ) { + store.requeue(task.id); + } +} + +/** + * Drain the queue to a terminal state. Every runnable task starts the moment + * its dependencies finish; independent branches run concurrently. Returns when + * every task is done, failed, or blocked by a failed dependency, or when the + * start backstop trips. 
+ */ +export async function drainQueue( + store: QueueStore, + runTask: RunTask, + opts: DrainOptions = DEFAULT_DRAIN_OPTIONS, +): Promise { + const running = new Map>(); + let starts = 0; + + for (;;) { + for (const task of store.nextRunnable()) { + if (++starts > opts.maxStarts) break; + // runOne marks the task in_progress synchronously, so the next + // nextRunnable() call no longer offers it. + const p = runOne(store, runTask, task).finally(() => + running.delete(task.id), + ); + running.set(task.id, p); + } + if (running.size === 0) break; + // Wake on the first finish; it may have unblocked dependents or requeued. + await Promise.race(running.values()); + } +} From 6c2318d7ff80e8f19164cbf93e8f1fcda062a481 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:18:49 -0400 Subject: [PATCH 05/12] feat(orchestrator): markdown-backed agent loader + full integration flow (#619) Co-authored-by: Claude Opus 4.8 (1M context) --- src/lib/agent/agent-interface.ts | 31 +- src/lib/agent/agent-runner.ts | 17 +- .../__tests__/agent-prompt-loader.test.ts | 205 ++++++++++++ .../orchestrator/agent-prompt-loader.ts | 310 ++++++++++++++++++ .../orchestrator/orchestrator-runner.ts | 296 +++++++++++++++++ src/lib/programs/orchestrator/queue-tools.ts | 14 + src/lib/programs/orchestrator/queue.ts | 6 + src/lib/task-stream/task-stream-push.ts | 2 + src/ui/logging-ui.ts | 18 +- src/ui/tui/primitives/ProgressList.tsx | 28 +- src/ui/wizard-ui.ts | 1 + 11 files changed, 902 insertions(+), 26 deletions(-) create mode 100644 src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts create mode 100644 src/lib/programs/orchestrator/agent-prompt-loader.ts create mode 100644 src/lib/programs/orchestrator/orchestrator-runner.ts diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts index 46f375a2..07342385 100644 --- a/src/lib/agent/agent-interface.ts +++ b/src/lib/agent/agent-interface.ts 
@@ -147,6 +147,12 @@ export type AgentConfig = { getPendingQuestion?: () => | import('@lib/wizard-session').PendingQuestion | null; + /** + * Orchestrator queue context. Present only when the `wizard-orchestrator` + * flag routes the run here; threaded into wizard-tools so the orchestrator + * tools register. + */ + orchestrator?: import('@lib/programs/orchestrator/queue-tools').OrchestratorToolsContext; }; /** @@ -168,6 +174,7 @@ export type StopHookResult = export function createStopHook( featureQueue: readonly AdditionalFeature[], signals?: AgentOutputSignals, + requestRemark = true, ): (input: { stop_hook_active: boolean }) => StopHookResult { let featureIndex = 0; let remarkRequested = false; @@ -195,8 +202,9 @@ export function createStopHook( return { decision: 'block', reason: prompt }; } - // Phase 2: collect remark (once) - if (!remarkRequested) { + // Phase 2: collect remark (once). Skipped when the caller opts out — the + // orchestrator suppresses it per task so it does not fire on every agent. + if (requestRemark && !remarkRequested) { remarkRequested = true; logToFile('Stop hook: requesting reflection'); return { @@ -537,8 +545,6 @@ export async function initializeAgent( logToFile('Agent initialization starting'); logToFile('Install directory:', options.installDir); - getUI().log.step('Initializing Claude agent...'); - try { // Configure LLM gateway environment variables (inherited by SDK subprocess) const gatewayUrl = getLlmGatewayUrlFromHost(config.posthogApiHost); @@ -590,6 +596,7 @@ export async function initializeAgent( skillsBaseUrl: config.skillsBaseUrl, askBridge: config.askBridge, askMaxQuestions: config.askMaxQuestions, + orchestrator: config.orchestrator, }); mcpServers['wizard-tools'] = wizardToolsServer; @@ -624,8 +631,6 @@ export async function initializeAgent( }); } - getUI().log.step(`Verbose logs: ${getLogFilePath()}`); - getUI().log.success("Agent initialized. 
Let's get cooking!"); return agentRunConfig; } catch (error) { getUI().log.error( @@ -671,6 +676,8 @@ export async function runAgent( errorMessage?: string; additionalFeatureQueue?: readonly AdditionalFeature[]; abortCases?: readonly AbortCaseMatcher[]; + /** Request the end-of-run reflection remark. Defaults to true. */ + requestRemark?: boolean; }, middleware?: { onMessage(message: any): void; @@ -930,7 +937,11 @@ export async function runAgent( Stop: [ { hooks: [ - createStopHook(config?.additionalFeatureQueue ?? [], signals), + createStopHook( + config?.additionalFeatureQueue ?? [], + signals, + config?.requestRemark ?? true, + ), ], timeout: 30, }, @@ -978,6 +989,7 @@ export async function runAgent( signals, receivedSuccessResult, tasks, + isOrchestratorEnabled(agentConfig.wizardFlags ?? {}), ); // [ABORT] detection: the skill emits "[ABORT] " when it @@ -1327,6 +1339,9 @@ function handleSDKMessage( signals: AgentOutputSignals, receivedSuccessResult = false, tasks?: Map, + // The orchestrator owns the TUI task panel (it renders its queue). Suppress the + // agent's own TaskCreate/TaskUpdate rendering so it does not clobber the queue. + suppressTaskRender = false, ): void { // Map preserves insertion order (the order the agent created the tasks). // Within that, group by status: completed first, then in_progress, then @@ -1338,7 +1353,7 @@ function handleSDKMessage( }; const rank = (status: string): number => STATUS_RANK[status] ?? 
2; const syncTasks = (): void => { - if (!tasks) return; + if (!tasks || suppressTaskRender) return; const sorted = Array.from(tasks.values()).sort( (a, b) => rank(a.status) - rank(b.status), ); diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 77c2f9ce..07a89c0c 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -31,12 +31,14 @@ import { AgentErrorType, AgentSignals, buildWizardMetadata, + isOrchestratorEnabled, } from './agent-interface'; import { checkAllSettingsConflicts, backupAndFixClaudeSettings, restoreClaudeSettings, } from './claude-settings'; +import { runOrchestrator } from '../programs/orchestrator/orchestrator-runner'; import { getCloudUrlFromRegion } from '@utils/urls'; import { evaluateWizardReadiness, @@ -45,7 +47,12 @@ import { getBlockingServiceKeys, SERVICE_LABELS, } from '@lib/health-checks/readiness'; -import { enableDebugLogs, initLogFile, logToFile } from '@utils/debug'; +import { + enableDebugLogs, + getLogFilePath, + initLogFile, + logToFile, +} from '@utils/debug'; import { createBenchmarkPipeline } from '@lib/middleware/benchmark'; import { wizardAbort, WizardError, registerCleanup } from '@utils/wizard-abort'; import { formatScanReport, writeScanReport } from '@lib/yara-hooks'; @@ -209,6 +216,11 @@ export async function runProgram( ): Promise { const boot = await bootstrapProgram(session, config, programConfig); + if (isOrchestratorEnabled(boot.wizardFlags)) { + getUI().log.info('Task-queue orchestrator enabled.'); + return runOrchestrator(session, programConfig, boot); + } + return runLinearProgram(session, config, programConfig, boot); } @@ -445,6 +457,7 @@ async function runLinearProgram( timeoutMs: config.askTimeoutMs, }); + getUI().log.step('Initializing Claude agent...'); const agent = await initializeAgent( { workingDirectory: session.installDir, @@ -466,6 +479,8 @@ async function runLinearProgram( }, sessionToOptions(session), ); + getUI().log.step(`Verbose logs: 
${getLogFilePath()}`); + getUI().log.success("Agent initialized. Let's get cooking!"); logToFile('[agent-runner] agent initialized'); diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts new file mode 100644 index 00000000..64a4bdab --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts @@ -0,0 +1,205 @@ +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { + agentRunTools, + buildRegistry, + parseAgentPrompt, + resolveTask, + type AgentPrompt, + type AgentRegistry, +} from '../agent-prompt-loader'; +import { QueueStore } from '../queue'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-')); +} + +function registryOf(prompts: AgentPrompt[]): AgentRegistry { + return buildRegistry( + prompts.map((p) => ({ ...p, flow: 'test-flow' })), + 'test-flow', + ); +} + +describe('parseAgentPrompt', () => { + const sample = `--- +type: instrument-events +model: claude-sonnet-4-6 # cheapest model that succeeds +skills: [instrument-events] +allowedTools: [Read, Edit, Grep, Glob, Bash] +disallowedTools: [enqueue_task] +dependsOn: [init] +--- + +## Goal +Add at least one capture call. 
+`; + + it('parses frontmatter scalars and inline arrays', () => { + const p = parseAgentPrompt(sample, 'fallback'); + expect(p.type).toBe('instrument-events'); + expect(p.model).toBe('claude-sonnet-4-6'); + expect(p.skills).toEqual(['instrument-events']); + expect(p.allowedTools).toEqual(['Read', 'Edit', 'Grep', 'Glob', 'Bash']); + expect(p.disallowedTools).toEqual(['enqueue_task']); + expect(p.dependsOn).toEqual(['init']); + }); + + it('strips inline comments and keeps the body', () => { + const p = parseAgentPrompt(sample, 'fallback'); + expect(p.model).not.toContain('#'); + expect(p.body).toContain('## Goal'); + expect(p.body).not.toContain('---'); + }); + + it('falls back to the menu id when type is omitted', () => { + const p = parseAgentPrompt('---\nmodel: x\n---\nbody', 'install'); + expect(p.type).toBe('install'); + }); + + it('parses the flow from frontmatter', () => { + const p = parseAgentPrompt('---\nflow: audit\n---\nx', 'fix-events'); + expect(p.flow).toBe('audit'); + }); + + it('marks the seed from frontmatter; everything else is a task', () => { + expect(parseAgentPrompt('---\nseed: true\n---\nplan', 'planner').seed).toBe( + true, + ); + expect(parseAgentPrompt('---\nmodel: x\n---\nbody', 'install').seed).toBe( + false, + ); + }); + + it('defaults missing array fields to empty and model to undefined', () => { + const p = parseAgentPrompt('no frontmatter at all', 'stub'); + expect(p.model).toBeUndefined(); + expect(p.skills).toEqual([]); + expect(p.dependsOn).toEqual([]); + expect(p.body).toBe('no frontmatter at all'); + }); +}); + +describe('agentRunTools', () => { + it('MCP-qualifies orchestrator tools and passes native tools through', () => { + const p = parseAgentPrompt( + '---\nallowedTools: [Read, read_handoffs]\ndisallowedTools: [enqueue_task, complete_task, Bash]\n---\nx', + 't', + ); + const { allowedTools, disallowedTools } = agentRunTools(p); + expect(allowedTools).toEqual([ + 'Read', + 'mcp__posthog-wizard__read_handoffs', + ]); + 
expect(disallowedTools).toEqual([ + 'mcp__posthog-wizard__enqueue_task', + 'mcp__posthog-wizard__complete_task', + 'Bash', + ]); + }); +}); + +describe('buildRegistry', () => { + const prompt = (over: Partial): AgentPrompt => ({ + type: 'x', + seed: false, + skills: [], + allowedTools: [], + disallowedTools: [], + dependsOn: [], + body: 'b', + ...over, + }); + + it('scopes to one flow and keeps the seed out of the task types', () => { + const registry = buildRegistry( + [ + prompt({ type: 'plan-audit', flow: 'audit', seed: true }), + prompt({ type: 'fix-events', flow: 'audit' }), + prompt({ type: 'install', flow: 'posthog-integration' }), + prompt({ type: 'example' }), + ], + 'audit', + ); + expect(registry.types).toEqual(['fix-events']); + expect(registry.seed?.type).toBe('plan-audit'); + expect(registry.get('install')).toBeUndefined(); + // A flowless prompt (e.g. the documentation example) joins no registry. + expect(registry.get('example')).toBeUndefined(); + }); +}); + +describe('resolveTask', () => { + let dir: string; + let store: QueueStore; + + beforeEach(() => { + dir = tmpDir(); + store = new QueueStore(dir, 'run-1'); + }); + + afterEach(() => { + fs.rmSync(dir, { recursive: true, force: true }); + }); + + const prompt: AgentPrompt = { + type: 'capture', + seed: false, + model: 'claude-haiku-4-5-20251001', + skills: ['instrument-events'], + allowedTools: ['Read', 'Edit'], + disallowedTools: ['enqueue_task'], + dependsOn: ['plan-capture'], + body: '## Goal\nInstrument the planned events.', + }; + + it('throws when no prompt is registered for the type', () => { + const registry = registryOf([]); + const task = { type: 'capture', dependsOn: [] } as never; + expect(() => resolveTask(registry, task, store)).toThrow(/capture/); + }); + + it('resolves model, tools, and skills from the prompt', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture' }); + const resolved = resolveTask(registry, task, store); + 
expect(resolved.model).toBe('claude-haiku-4-5-20251001'); + expect(resolved.skills).toEqual(['instrument-events']); + expect(resolved.disallowedTools).toEqual([ + 'mcp__posthog-wizard__enqueue_task', + ]); + }); + + it('prefers the enqueue model override over the prompt model', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture', model: 'override-x' }); + expect(resolveTask(registry, task, store).model).toBe('override-x'); + }); + + it("appends upstream dependencies' handoffs as context", () => { + const registry = registryOf([prompt]); + const dep = store.enqueue({ type: 'plan-capture' }); + store.complete(dep.id, { + goals: 'decide events', + did: 'picked signup and purchase', + forNextAgent: 'instrument those two', + }); + const task = store.enqueue({ + type: 'capture', + dependsOn: [dep.id], + }); + const resolved = resolveTask(registry, task, store); + expect(resolved.prompt).toContain('Context from previous steps'); + expect(resolved.prompt).toContain('picked signup and purchase'); + expect(resolved.prompt).toContain('instrument those two'); + }); + + it('omits the context section when there are no handoffs', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture' }); + expect(resolveTask(registry, task, store).prompt).not.toContain( + 'Context from previous steps', + ); + }); +}); diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts new file mode 100644 index 00000000..ee351db8 --- /dev/null +++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts @@ -0,0 +1,310 @@ +/** + * Agent-prompt loader + registry. + * + * Agent prompts are the WHAT of a task: a markdown file per type, served from + * context-mill as the `agents` content type (parallel to skills). 
The frontmatter + * carries the artifacts the executor needs — model, the mini-skills to load (the + * HOW), the tools the task may use, and its dependencies — and the body is the + * instruction the agent reads. + * + * The registry is fetched once at startup and scoped to one flow — agents + * declare `flow` and (for the planner) `seed: true` in frontmatter, so each + * program (integration, audit, migration, ...) ships its own agent set and the + * loader stays generic. Every prompt is downloaded and parsed up front, so + * resolving a task to its run config is synchronous and adds no mid-drain + * network latency. The registry's type list also drives `enqueue_task` + * validation. + */ +import type { QueueStore, QueuedTask } from './queue'; +import type { ResolvedTask } from './executor'; + +/** + * The basics the client injects around every agent-prompt body. The `/agents/` + * files carry intent only (goal, success criteria); the wizard owns the I/O + * contract — who the agent is, how it reports, how it surfaces progress — so the + * authored prompts never restate it. + */ +export interface OrchestratorPromptContext { + projectId: number; + projectApiKey: string; + host: string; + /** Path to the framework's reference implementation (EXAMPLE.md), if available. */ + examplePath?: string; + /** Path to the framework's rules (COMMANDMENTS.md), if available. */ + commandmentsPath?: string; +} + +function projectContext(ctx: OrchestratorPromptContext): string { + return `You have access to the PostHog MCP server and the wizard tools. + +Project context: +- PostHog Project ID: ${ctx.projectId} +- PostHog public token: ${ctx.projectApiKey} +- PostHog Host: ${ctx.host}`; +} + +/** Points the agent at the framework's reference integration to learn patterns from. */ +function exampleReference(ctx: OrchestratorPromptContext): string | null { + if (!ctx.examplePath) return null; + return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. 
It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`; +} + +/** The framework's rules ship with the reference skill; every task follows them. */ +function commandmentsReference(ctx: OrchestratorPromptContext): string | null { + if (!ctx.commandmentsPath) return null; + return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; +} + +const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; + +const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; + +/** A task agent's full prompt: injected basics, then the authored intent. 
*/ +export function assembleTaskPrompt( + ctx: OrchestratorPromptContext, + body: string, +): string { + return [ + projectContext(ctx), + exampleReference(ctx), + commandmentsReference(ctx), + TASK_BASICS, + body, + ] + .filter(Boolean) + .join('\n\n'); +} + +/** The seed agent's full prompt: injected basics, then the authored intent. */ +export function assembleSeedPrompt( + ctx: OrchestratorPromptContext, + body: string, +): string { + return [projectContext(ctx), SEED_BASICS, body].join('\n\n'); +} + +/** Used when neither the enqueue call nor the prompt frontmatter names a model. */ +const DEFAULT_TASK_MODEL = 'claude-sonnet-4-6'; + +/** Orchestrator tools are MCP tools under the `posthog-wizard` server. Frontmatter + * names them short (e.g. `enqueue_task`); the SDK gates on the full name. */ +const ORCHESTRATOR_TOOL_PREFIX = 'mcp__posthog-wizard__'; +const ORCHESTRATOR_TOOLS = new Set([ + 'enqueue_task', + 'complete_task', + 'read_handoffs', +]); + +/** A parsed agent prompt. The frontmatter fields plus the markdown body. */ +export interface AgentPrompt { + type: string; + /** Human-readable title for the TUI; falls back to `type` when absent. */ + label?: string; + /** The flow this agent belongs to (the program id, e.g. \`posthog-integration\`). */ + flow?: string; + /** Marks the flow's planner: it seeds the queue and is not an enqueueable task. */ + seed: boolean; + model?: string; + skills: string[]; + allowedTools: string[]; + disallowedTools: string[]; + dependsOn: string[]; + body: string; +} + +export interface AgentRegistry { + /** The flow's enqueueable task types — every prompt except the seed. */ + readonly types: string[]; + /** The flow's planner, the one prompt marked `seed: true` in its frontmatter. */ + readonly seed?: AgentPrompt; + get(type: string): AgentPrompt | undefined; +} + +/** The registry for one flow's prompts. Pure; the loader feeds it the fetched set. 
*/ +export function buildRegistry( + prompts: readonly AgentPrompt[], + flow: string, +): AgentRegistry { + const inFlow = prompts.filter((p) => p.flow === flow); + const byType = new Map(inFlow.map((p) => [p.type, p])); + return { + types: inFlow.filter((p) => !p.seed).map((p) => p.type), + seed: inFlow.find((p) => p.seed), + get: (type) => byType.get(type), + }; +} + +interface AgentMenu { + agents: { id: string; downloadUrl: string }[]; +} + +/** A native tool passes through; an orchestrator tool gets its MCP-qualified name. */ +function expandToolName(name: string): string { + return ORCHESTRATOR_TOOLS.has(name) + ? `${ORCHESTRATOR_TOOL_PREFIX}${name}` + : name; +} + +/** A prompt's allow/disallow lists with orchestrator tool names MCP-qualified. */ +export function agentRunTools(prompt: AgentPrompt): { + allowedTools: string[]; + disallowedTools: string[]; +} { + return { + allowedTools: prompt.allowedTools.map(expandToolName), + disallowedTools: prompt.disallowedTools.map(expandToolName), + }; +} + +function toStringArray(value: unknown): string[] { + if (!Array.isArray(value)) return []; + return value.filter((v): v is string => typeof v === 'string'); +} + +/** + * Parse the leading `---` frontmatter block and the markdown body. The + * frontmatter is a small, known schema (scalars and inline `[a, b]` arrays), so + * a tiny parser covers it without a YAML dependency. Inline `# comments` after a + * value are stripped. `fallbackType` is the menu id, used when the body omits + * `type:`. + */ +export function parseAgentPrompt( + text: string, + fallbackType: string, +): AgentPrompt { + const match = text.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/); + const frontmatter = match ? match[1] : ''; + const body = (match ? 
match[2] : text).trim(); + + const fields: Record = {}; + for (const rawLine of frontmatter.split(/\r?\n/)) { + const line = rawLine.replace(/\s+#.*$/, '').trim(); + if (!line || line.startsWith('#')) continue; + const kv = line.match(/^([\w-]+):\s*(.*)$/); + if (!kv) continue; + const [, key, raw] = kv; + if (raw.startsWith('[') && raw.endsWith(']')) { + fields[key] = raw + .slice(1, -1) + .split(',') + .map((s) => s.trim().replace(/^['"]|['"]$/g, '')) + .filter(Boolean); + } else { + fields[key] = raw.replace(/^['"]|['"]$/g, ''); + } + } + + const model = typeof fields.model === 'string' ? fields.model : undefined; + return { + type: typeof fields.type === 'string' ? fields.type : fallbackType, + label: typeof fields.label === 'string' ? fields.label : undefined, + flow: typeof fields.flow === 'string' ? fields.flow : undefined, + seed: fields.seed === 'true', + model, + skills: toStringArray(fields.skills), + allowedTools: toStringArray(fields.allowedTools), + disallowedTools: toStringArray(fields.disallowedTools), + dependsOn: toStringArray(fields.dependsOn), + body, + }; +} + +async function fetchText(url: string): Promise { + const res = await fetch(url); + if (!res.ok) { + throw new Error(`Fetch ${url} failed: ${res.status} ${res.statusText}`); + } + return res.text(); +} + +/** + * Fetch the agent menu and every agent prompt it lists, parse them, and build + * the registry for one flow. Throws if the menu cannot be fetched — the + * orchestrator cannot run without its prompts. + */ +export async function loadAgentRegistry( + skillsBaseUrl: string, + flow: string, +): Promise { + const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`); + const menu = JSON.parse(menuRaw) as AgentMenu; + + const prompts = await Promise.all( + (menu.agents ?? 
[]).map(async (entry) => { + const text = await fetchText(entry.downloadUrl); + return parseAgentPrompt(text, entry.id); + }), + ); + + return buildRegistry(prompts, flow); +} + +/** + * Render a task's own inputs into a section, so a fanned-out task (e.g. one + * `capture` per event) sees the specific thing it owns. Empty when there are none. + */ +function renderInputs(task: QueuedTask): string { + const entries = Object.entries(task.inputs ?? {}); + if (entries.length === 0) return ''; + const lines = entries.map(([k, v]) => `- ${k}: ${formatInputValue(v)}`); + return `## Your task input\n\n${lines.join('\n')}`; +} + +function formatInputValue(value: unknown): string { + if (typeof value === 'string') return value; + return JSON.stringify(value); +} + +/** + * Render the handoffs of a task's completed dependencies into a context section, + * so a fresh agent sees what the upstream steps did. Empty when there are none. + */ +function renderHandoffContext(task: QueuedTask, store: QueueStore): string { + const lines: string[] = []; + for (const depId of task.dependsOn) { + const dep = store.get(depId); + const handoff = store.readHandoff(depId); + if (!dep || !handoff) continue; + lines.push(`### ${dep.type}`); + lines.push(`- did: ${handoff.did}`); + lines.push(`- for you: ${handoff.forNextAgent}`); + if (handoff.filesTouched?.length) { + lines.push(`- files: ${handoff.filesTouched.join(', ')}`); + } + lines.push(''); + } + if (lines.length === 0) return ''; + return `## Context from previous steps\n\n${lines.join('\n')}`.trim(); +} + +/** + * Resolve a queued task to its run config: the prompt body (with upstream + * handoffs appended), the model, and the tool lists with orchestrator tool names + * MCP-qualified. The model precedence is enqueue override, then prompt, then + * default. Throws if no prompt is registered for the task's type. 
+ */ +export function resolveTask( + registry: AgentRegistry, + task: QueuedTask, + store: QueueStore, +): ResolvedTask { + const prompt = registry.get(task.type); + if (!prompt) { + throw new Error(`No agent prompt registered for task type "${task.type}"`); + } + + const body = [ + renderInputs(task), + prompt.body, + renderHandoffContext(task, store), + ] + .filter(Boolean) + .join('\n\n'); + + return { + model: task.model ?? prompt.model ?? DEFAULT_TASK_MODEL, + ...agentRunTools(prompt), + prompt: body, + skills: prompt.skills, + }; +} diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts new file mode 100644 index 00000000..978a8f31 --- /dev/null +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -0,0 +1,296 @@ +/** + * Experimental task-queue orchestrator runner. + * + * Branches from the linear runner when the `wizard-orchestrator` flag is on. An + * orchestrator agent inspects the repo and seeds an in-memory task queue; an + * executor drains it, running one fresh agent per task. + * + * Both the WHAT (agent prompts: model, goal, success criteria, tools) and the + * HOW (mini-skills) are markdown served from context-mill — the seed and every + * task resolve to a prompt fetched at startup into the registry. The wizard side + * stays product-ignorant: it is the queue, the executor, and the loader. 
+ */ +import { randomUUID } from 'crypto'; +import { existsSync } from 'fs'; +import * as path from 'path'; +import { + initializeAgent, + runAgent, + type AgentConfig, +} from '../../agent/agent-interface'; +import { OutroKind, type WizardSession } from '../../wizard-session'; +import { detectNodePackageManagers } from '../../detection/package-manager'; +import { installSkillById } from '../../wizard-tools'; +import { getUI } from '../../../ui'; +import { analytics } from '../../../utils/analytics'; +import { logToFile } from '../../../utils/debug'; +import type { ProgramConfig } from '../program-step'; +import type { BootstrapResult } from '../../agent/agent-runner'; +import type { WizardRunOptions } from '../../../utils/types'; +import { QueueStore, QUEUE_DIR_NAME, TaskStatus } from './queue'; +import { drainQueue, type RunTask } from './executor'; +import { + agentRunTools, + assembleSeedPrompt, + assembleTaskPrompt, + loadAgentRegistry, + resolveTask, + type OrchestratorPromptContext, +} from './agent-prompt-loader'; + +function toTodoStatus(status: TaskStatus): string { + switch (status) { + case TaskStatus.Running: + return 'in_progress'; + case TaskStatus.Done: + case TaskStatus.Failed: + return 'completed'; + case TaskStatus.Skipped: + return 'skipped'; + default: + return 'pending'; + } +} + +function sessionRunOptions(session: WizardSession): WizardRunOptions { + return { + installDir: session.installDir, + debug: session.debug, + default: false, + signup: session.signup, + localMcp: session.localMcp, + ci: session.ci, + benchmark: session.benchmark, + projectId: session.projectId, + apiKey: session.apiKey, + yaraReport: session.yaraReport, + }; +} + +export async function runOrchestrator( + session: WizardSession, + programConfig: ProgramConfig, + boot: BootstrapResult, +): Promise { + const runId = randomUUID(); + const store = new QueueStore(session.installDir, runId); + + const options = sessionRunOptions(session); + + // The WHAT (agent prompts) is 
served from context-mill. Fetch the registry + // once up front: its types drive enqueue validation, and resolving a task to + // its run config is then synchronous, with no mid-drain network latency. + const registry = await loadAgentRegistry( + boot.skillsBaseUrl, + programConfig.id, + ); + const seedPrompt = registry.seed; + if (!seedPrompt) { + throw new Error( + `No seed agent prompt (frontmatter \`seed: true\`) for flow "${programConfig.id}" is available from ${boot.skillsBaseUrl}.`, + ); + } + + // Give task agents the framework's finished reference integration to match, + // the same EXAMPLE.md the linear flow uses. Install it under the run dir rather + // than .claude/skills so its "do everything" workflow is not auto-loaded as a + // skill — only the example file is read, when the agent's prompt points at it. + let examplePath: string | undefined; + let commandmentsPath: string | undefined; + if (session.skillId) { + const ref = await installSkillById( + session.skillId, + session.installDir, + boot.skillsBaseUrl, + path.join(QUEUE_DIR_NAME, 'reference'), + ); + if (ref.kind === 'ok') { + const example = path.join(ref.path, 'references', 'EXAMPLE.md'); + if (existsSync(path.join(session.installDir, example))) { + examplePath = example; + } + const commandments = path.join(ref.path, 'references', 'COMMANDMENTS.md'); + if (existsSync(path.join(session.installDir, commandments))) { + commandmentsPath = commandments; + } + } else { + logToFile(`[orchestrator] reference example unavailable: ${ref.kind}`); + } + } + + // The client injects the basics (project context + the I/O contract) around + // every authored agent-prompt body. 
+ const promptContext: OrchestratorPromptContext = { + projectId: boot.projectId, + projectApiKey: boot.projectApiKey, + host: boot.host, + examplePath, + commandmentsPath, + }; + + logToFile( + `[orchestrator] START program=${programConfig.id} dir=${session.installDir} run=${runId}`, + ); + analytics.wizardCapture('orchestrator started', { + program_id: programConfig.id, + }); + getUI().startRun(); + + // Label precedence: what the orchestrator set at enqueue, then the agent + // prompt's default, then the bare type. + const labelFor = (t: { type: string; label?: string }) => + t.label ?? registry.get(t.type)?.label ?? t.type; + const renderQueue = () => + getUI().syncTodos( + store.list().map((t) => ({ + content: labelFor(t), + status: toTodoStatus(t.status), + activeForm: labelFor(t), + })), + ); + + // Each agent gets its own config so its wizard-tools server is bound to the + // task it runs — independent tasks run in parallel, and attribution of + // complete_task / enqueue_task must hold per agent. The seed is not a task, + // so its context has no task id. + const agentConfigFor = (currentTaskId?: string): AgentConfig => ({ + workingDirectory: session.installDir, + posthogMcpUrl: boot.mcpUrl, + posthogApiKey: boot.accessToken, + posthogApiHost: boot.host, + detectPackageManager: detectNodePackageManagers, + skillsBaseUrl: boot.skillsBaseUrl, + wizardFlags: boot.wizardFlags, + // Tag agent events as orchestrator so telemetry segments from the baseline. + wizardMetadata: { ...boot.wizardMetadata, VARIANT: 'orchestrator' }, + integrationLabel: programConfig.id, + orchestrator: { + store, + validTypes: registry.types, + currentTaskId, + }, + }); + + const spinner = getUI().spinner(); + + // 1. Seed the queue with the orchestrator agent. It is itself an agent prompt + // (the WHAT), so its model and tools come from its frontmatter. The seed + // plans the graph, it is not a task. 
+ const seedAgent = await initializeAgent(agentConfigFor(), options); + const seedResult = await runAgent( + { + ...seedAgent, + model: seedPrompt.model ?? seedAgent.model, + ...agentRunTools(seedPrompt), + }, + assembleSeedPrompt(promptContext, seedPrompt.body), + options, + spinner, + { + spinnerMessage: 'Planning the integration...', + successMessage: 'Planned the integration', + additionalFeatureQueue: [], + requestRemark: false, + }, + ); + if (seedResult.error) { + logToFile( + `[orchestrator] seed error: ${seedResult.error} ${ + seedResult.message ?? '' + }`, + ); + } + analytics.wizardCapture('orchestrator seeded', { + task_count: store.list().length, + types: store.list().map((t) => t.type), + }); + renderQueue(); + + // 2. Drain the queue, one fresh agent per task; independent tasks run in + // parallel, the seed's graph being the only schedule. Each task resolves to + // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then + // runs on its own model and tools. + const runTask: RunTask = async (task) => { + renderQueue(); + try { + const resolved = resolveTask(registry, task, store); + const agent = await initializeAgent(agentConfigFor(task.id), options); + for (const skillId of resolved.skills) { + const result = await installSkillById( + skillId, + session.installDir, + boot.skillsBaseUrl, + ); + if (result.kind !== 'ok') { + logToFile( + `[orchestrator] skill install failed type=${task.type} skill=${skillId} ${result.kind}`, + ); + } + } + await runAgent( + { + ...agent, + model: resolved.model, + allowedTools: resolved.allowedTools, + disallowedTools: resolved.disallowedTools, + }, + assembleTaskPrompt(promptContext, resolved.prompt), + options, + spinner, + // Empty messages suppress the per-task spinner lines (the spinner renders + // only when a message is set); the queue panel shows progress. Errors + // still surface — runAgent stops the spinner with its own error text. 
+ // No per-task remark — the reflection would fire on every task. + { + spinnerMessage: '', + successMessage: '', + additionalFeatureQueue: [], + requestRemark: false, + }, + ); + } finally { + renderQueue(); + } + }; + await drainQueue(store, runTask); + + renderQueue(); + + const summary = store.summary(); + logToFile( + `[orchestrator] DONE done=${summary.done} failed=${summary.failed} total=${summary.total}`, + ); + analytics.wizardCapture('orchestrator run finished', { + tasks_total: summary.total, + tasks_done: summary.done, + tasks_failed: summary.failed, + }); + + // The build step flags any unresolved conflict in its handoff; surface the + // one-liner here and point the user at the report for the detail. + const buildTask = store.list().find((t) => t.type === 'build'); + const conflict = buildTask + ? store.readHandoff(buildTask.id)?.conflict + : undefined; + + // Prefer the report the run wrote; fall back to the raw queue if it is missing. + const reportPath = path.join(session.installDir, 'posthog-setup-report.md'); + const reportFile = existsSync(reportPath) + ? 'posthog-setup-report.md' + : store.queuePath; + + const message = conflict + ? 'PostHog set up, with one conflict to review.' + : `PostHog set up: ${summary.done}/${summary.total} steps completed.`; + getUI().setOutroData({ + kind: OutroKind.Success, + message, + body: conflict + ? 
`⚠ Build conflict: ${conflict}\nFull details are in the report.` + : undefined, + reportFile, + docsUrl: 'https://posthog.com/docs/ai-engineering/ai-wizard', + }); + getUI().outro(message); + await analytics.shutdown('success'); +} diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts index 6d3aad52..64e5bc93 100644 --- a/src/lib/programs/orchestrator/queue-tools.ts +++ b/src/lib/programs/orchestrator/queue-tools.ts @@ -29,6 +29,7 @@ export interface OrchestratorToolsContext { export interface EnqueueArgs { type: string; + label?: string; inputs?: Record; dependsOn?: string[]; model?: string; @@ -115,6 +116,7 @@ export function applyEnqueue( const task = ctx.store.enqueue({ type: args.type, + label: args.label, inputs: args.inputs ?? {}, dependsOn: args.dependsOn ?? [], model: args.model, @@ -175,6 +177,12 @@ const HANDOFF_SHAPE = { did: z.string().describe('What you actually did.'), forNextAgent: z.string().describe('What the next agent should know.'), filesTouched: z.array(z.string()).optional(), + conflict: z + .string() + .optional() + .describe( + 'A one-line summary of any conflict you could not cleanly resolve (e.g. a dependency or build conflict). Put full detail in your work; this line is surfaced to the user.', + ), }; type SdkTool = ( @@ -204,6 +212,12 @@ export function buildOrchestratorTools( type: z .string() .describe(`The task type. One of: ${ctx.validTypes.join(', ')}.`), + label: z + .string() + .optional() + .describe( + 'A short label for the UI — the action in a few words (e.g. "Add the PostHog SDK", "Initialize PostHog"). 
Leave out file names, class names, and other specifics.', + ), inputs: z.record(z.unknown()).optional(), dependsOn: z .array(z.string()) diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 5f62c718..4ecc3cb5 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -28,6 +28,8 @@ export type TaskStatus = (typeof TaskStatus)[keyof typeof TaskStatus]; export interface QueuedTask { id: string; type: string; + /** Human-readable label for the TUI, set by the enqueuing agent. */ + label?: string; status: TaskStatus; dependsOn: string[]; inputs: Record; @@ -56,10 +58,13 @@ export interface TaskHandoff { did: string; forNextAgent: string; filesTouched?: string[]; + /** A one-line summary of any unresolved conflict, surfaced in the outro. */ + conflict?: string; } export interface EnqueueInput { type: string; + label?: string; inputs?: Record; dependsOn?: string[]; model?: string; @@ -155,6 +160,7 @@ export class QueueStore { const task: QueuedTask = { id: randomUUID(), type: input.type, + label: input.label, status: TaskStatus.Pending, dependsOn: input.dependsOn ?? [], inputs: input.inputs ?? {}, diff --git a/src/lib/task-stream/task-stream-push.ts b/src/lib/task-stream/task-stream-push.ts index cecd9ff8..02815419 100644 --- a/src/lib/task-stream/task-stream-push.ts +++ b/src/lib/task-stream/task-stream-push.ts @@ -37,6 +37,8 @@ const STATUS_MAP: Record = { [TaskStatus.Pending]: StreamTaskStatus.Pending, [TaskStatus.InProgress]: StreamTaskStatus.InProgress, [TaskStatus.Completed]: StreamTaskStatus.Completed, + // The stream has no skipped state; skipped is terminal, so report it resolved. 
+ [TaskStatus.Skipped]: StreamTaskStatus.Completed, }; function buildTasks(items: TaskItem[]): StreamTask[] { diff --git a/src/ui/logging-ui.ts b/src/ui/logging-ui.ts index fd0c34f1..9ae0a2ee 100644 --- a/src/ui/logging-ui.ts +++ b/src/ui/logging-ui.ts @@ -232,20 +232,22 @@ export class LoggingUI implements WizardUI { // the session. } + private lastTodoLine = ''; + syncTodos( todos: Array<{ content: string; status: string; activeForm?: string }>, ): void { const completed = todos.filter( (t) => t.status === TaskStatus.Completed, ).length; - const inProgress = todos.find((t) => t.status === TaskStatus.InProgress); - if (inProgress) { - console.log( - `◌ [${completed}/${todos.length}] ${ - inProgress.activeForm || inProgress.content - }`, - ); - } + const active = todos.filter((t) => t.status === TaskStatus.InProgress); + if (active.length === 0) return; + const labels = active.map((t) => t.activeForm || t.content).join(' · '); + const line = `◌ [${completed}/${todos.length}] ${labels}`; + // The queue re-renders on every transition; print only what changed. 
+ if (line === this.lastTodoLine) return; + this.lastTodoLine = line; + console.log(line); } setEventPlan(_events: Array<{ name: string; description: string }>): void { diff --git a/src/ui/tui/primitives/ProgressList.tsx b/src/ui/tui/primitives/ProgressList.tsx index b72156c8..3c84c8ee 100644 --- a/src/ui/tui/primitives/ProgressList.tsx +++ b/src/ui/tui/primitives/ProgressList.tsx @@ -11,7 +11,7 @@ import { LoadingBox } from './LoadingBox.js'; export interface ProgressItem { label: string; activeForm?: string; - status: 'pending' | 'in_progress' | 'completed'; + status: 'pending' | 'in_progress' | 'completed' | 'skipped'; } interface ProgressListProps { @@ -20,7 +20,9 @@ interface ProgressListProps { } export const ProgressList = ({ items, title }: ProgressListProps) => { - const completed = items.filter((t) => t.status === 'completed').length; + const resolved = items.filter( + (t) => t.status === 'completed' || t.status === 'skipped', + ).length; const total = items.length; return ( @@ -33,6 +35,7 @@ export const ProgressList = ({ items, title }: ProgressListProps) => { )} {items.length === 0 && } {items.map((item, i) => { + const skipped = item.status === 'skipped'; const icon = item.status === 'completed' ? Icons.squareFilled @@ -45,15 +48,22 @@ export const ProgressList = ({ items, title }: ProgressListProps) => { : item.status === 'in_progress' ? Colors.primary : Colors.muted; - const label = - item.status === 'in_progress' && item.activeForm - ? item.activeForm - : item.label; + const label = skipped + ? `${item.label} (skipped)` + : item.status === 'in_progress' && item.activeForm + ? item.activeForm + : item.label; return ( {icon} - {label} + + {' '} + {label} + ); })} @@ -61,8 +71,8 @@ export const ProgressList = ({ items, title }: ProgressListProps) => { - {completed < total - ? `Progress: ${completed}/${total} completed` + {resolved < total + ? 
`Progress: ${resolved}/${total} completed` : 'Cleaning up...'} diff --git a/src/ui/wizard-ui.ts b/src/ui/wizard-ui.ts index 97216262..cbb494d4 100644 --- a/src/ui/wizard-ui.ts +++ b/src/ui/wizard-ui.ts @@ -21,6 +21,7 @@ export enum TaskStatus { Pending = 'pending', InProgress = 'in_progress', Completed = 'completed', + Skipped = 'skipped', } export function isTaskStatus(value: string): value is TaskStatus { From f26d97840dca23cf4055c84e2f6990778be42d8a Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:22:34 -0400 Subject: [PATCH 06/12] feat(analytics): identify the user so feature flags can target by email (#620) Co-authored-by: Claude Opus 4.8 (1M context) --- src/lib/agent/agent-runner.ts | 3 ++ src/utils/analytics.ts | 58 ++++++++++++++++++++--------------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 07a89c0c..a08320ed 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -349,6 +349,9 @@ async function bootstrapProgram( getUI().setRoleAtOrganization(roleAtOrganization); getUI().setApiUser(user); + // Identify the user (email, name) before evaluating flags, so flags can target + // the individual user and not just $app_name. + if (user) analytics.identifyUser(user); analytics.setGroups(groupsFromUser(user, host)); // 4.5. AI opt-in enforcement. Parks here while AiOptInRequiredScreen is diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index 48a0717b..bf849a7e 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -8,7 +8,7 @@ import type { WizardSession } from '@lib/wizard-session'; import type { ApiUser } from '@lib/api'; import { v4 as uuidv4 } from 'uuid'; import { IS_PRODUCTION_BUILD } from '@env'; -import { debug } from './debug'; +import { debug, logToFile } from './debug'; /** * Extract a standard property bag from the current session. 
@@ -58,6 +58,7 @@ export class Analytics { private appName = 'wizard'; private activeFlags: Record | null = null; private groups: Record = {}; + private personProperties: Record = {}; constructor() { this.client = new PostHog(ANALYTICS_POSTHOG_PUBLIC_PROJECT_WRITE_KEY, { @@ -107,10 +108,12 @@ export class Analytics { } /** - * Associate the run with the logged-in user, once per id: identify them - * (email, name), then alias the run's anonymous id onto the identified - * person so pre-login events merge in. Alias only ever fires after - * identification. + * Associate the run with the logged-in user, once per id. Identifies them + * (email, name) and records those person properties so events carry them and + * feature flags can target the individual user — without the email here the + * wizard only sends `$app_name`, so email-targeted flags never match. Opens + * the analytics session on first login, then aliases the run's anonymous id + * onto the identified person so pre-login events merge in. */ identifyUser(user: ApiUser) { const distinctId = user.distinct_id; @@ -127,25 +130,28 @@ export class Analytics { this.sessionId = uuidv4(); this.tags.$session_id = this.sessionId; } - this.client.identify({ - distinctId, - properties: { - $set: { - ...(user.email ? { email: user.email } : {}), - ...(user.first_name || user.last_name - ? { - name: [user.first_name, user.last_name] - .filter(Boolean) - .join(' '), - } - : {}), - }, - }, - }); + const props: Record = {}; + if (user.email) props.email = user.email; + const name = [user.first_name, user.last_name] + .filter(Boolean) + .join(' ') + .trim(); + if (name) props.name = name; + this.personProperties = props; + this.client.identify({ distinctId, properties: { $set: props } }); this.client.alias({ distinctId, alias: this.anonymousId, }); + // The flag snapshot is per identity. 
Anything evaluated before login (the + // intro screen reads the tools-menu flag) was anonymous — drop it so the + // next read re-evaluates as this user. + this.activeFlags = null; + } + + /** Person properties sent with flag evaluation: app name plus the user's. */ + private flagPersonProperties(): Record { + return { $app_name: this.appName, ...this.personProperties }; } setTag(key: string, value: string | boolean | number | null | undefined) { @@ -198,9 +204,7 @@ export class Analytics { const distinctId = this.distinctId ?? this.anonymousId; return await this.client.getFeatureFlag(flagKey, distinctId, { sendFeatureFlagEvents: true, - personProperties: { - $app_name: this.appName, - }, + personProperties: this.flagPersonProperties(), }); } catch (error) { debug('Failed to get feature flag:', flagKey, error); @@ -219,8 +223,13 @@ export class Analytics { } try { const distinctId = this.distinctId ?? this.anonymousId; + logToFile('[flags] evaluating as', { + distinctId, + identified: this.distinctId !== undefined, + personProperties: this.flagPersonProperties(), + }); const result = await this.client.getAllFlagsAndPayloads(distinctId, { - personProperties: { $app_name: this.appName }, + personProperties: this.flagPersonProperties(), }); const flags = result.featureFlags ?? {}; const out: Record = {}; @@ -229,6 +238,7 @@ export class Analytics { out[key] = typeof value === 'boolean' ? 
String(value) : String(value); } this.activeFlags = out; + logToFile('[flags] evaluated', out); return out; } catch (error) { debug('Failed to get all feature flags:', error); From 1d7127d9dafe936a71667b898fb94d3ba6a2757b Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:22:43 -0400 Subject: [PATCH 07/12] feat(ci): flag overrides that exist only in CI builds (#635) Co-authored-by: Claude Opus 4.8 (1M context) --- scripts/smoke-test.sh | 33 +++++++++- src/env.ts | 4 ++ src/utils/__tests__/ci-flag-overrides.test.ts | 63 +++++++++++++++++++ src/utils/analytics.ts | 17 +++-- src/utils/ci-flag-overrides.ts | 46 ++++++++++++++ 5 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 src/utils/__tests__/ci-flag-overrides.test.ts create mode 100644 src/utils/ci-flag-overrides.ts diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh index 5abaaca9..1e9a690b 100755 --- a/scripts/smoke-test.sh +++ b/scripts/smoke-test.sh @@ -19,7 +19,38 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH exit 1 } -# ── 2. --ci rejected in production builds ──────────────────────────────────── +# ── 2. CI flag overrides physically absent from production builds ─────────── +# The override path (src/utils/ci-flag-overrides.ts) is dead code in published +# builds and tsdown strips it; its env var name appearing in dist/*.js means +# dead-code elimination regressed and a prod surface leaked. Sourcemaps keep +# the original source, so only .js output counts. +OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES' +if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then + # CI builds must keep the path — its absence means the override silently + # stopped working and CI is back to testing live flags. + if ! 
grep -q "$OVERRIDE_MARKER" ./dist/*.js; then + echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2 + exit 1 + fi + # And a real invocation must accept the env var. yargs claims every + # POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects + # unknown ones during command parse (--version/--help short-circuit and + # prove nothing). The run exits fast on the missing api key — all this + # asserts is that yargs did not reject the environment. + ci_probe=$(WIZARD_CI_FLAG_OVERRIDES='{"wizard-orchestrator":true}' node "$DIST_BIN" --ci --install-dir /tmp/wizard-smoke-probe 2>&1) || true + if echo "$ci_probe" | grep -q 'Unknown argument'; then + echo 'Smoke test failed: CI binary rejects WIZARD_CI_FLAG_OVERRIDES in the environment' >&2 + echo "$ci_probe" | head -3 >&2 + exit 1 + fi +else + if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then + echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2 + exit 1 + fi +fi + +# ── 3. --ci rejected in production builds ──────────────────────────────────── # build:ci sets WIZARD_BUILD_NODE_ENV=ci → --ci stays enabled → skip the check. if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then exit 0 diff --git a/src/env.ts b/src/env.ts index 6eec7cad..c32e886a 100644 --- a/src/env.ts +++ b/src/env.ts @@ -39,6 +39,10 @@ export const IS_PRODUCTION_BUILD = process.env.NODE_ENV === 'production'; * Add new keys here when a new runtime dependency is needed. */ type RuntimeEnvKey = + // CI-build-only flag overrides (see utils/ci-flag-overrides.ts). + // Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD') + // would claim it as an unknown CLI option and strict-reject the run. 
+ | 'WIZARD_CI_FLAG_OVERRIDES' // Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix) | 'POSTHOG_WIZARD_BENCHMARK_CONFIG' | 'POSTHOG_WIZARD_BENCHMARK_FILE' diff --git a/src/utils/__tests__/ci-flag-overrides.test.ts b/src/utils/__tests__/ci-flag-overrides.test.ts new file mode 100644 index 00000000..4d2333a1 --- /dev/null +++ b/src/utils/__tests__/ci-flag-overrides.test.ts @@ -0,0 +1,63 @@ +import { applyCiFlagOverrides } from '@utils/ci-flag-overrides'; + +jest.mock('@utils/debug', () => ({ + logToFile: jest.fn(), + debug: jest.fn(), +})); + +const ENV_KEY = 'WIZARD_CI_FLAG_OVERRIDES'; + +describe('applyCiFlagOverrides', () => { + afterEach(() => { + delete process.env[ENV_KEY]; + }); + + // Jest runs with NODE_ENV=test, so IS_PRODUCTION_BUILD is false and the + // override path is live — the same shape a `build:ci` bundle has. + describe('in CI builds', () => { + it('returns the flags untouched when no override is set', () => { + const flags = { 'wizard-orchestrator': 'false' }; + expect(applyCiFlagOverrides(flags)).toEqual(flags); + }); + + it('merges overrides over the fetched flags, stringifying values', () => { + process.env[ENV_KEY] = JSON.stringify({ + 'wizard-orchestrator': true, + 'wizard-next-v2': 'legacy', + }); + expect( + applyCiFlagOverrides({ + 'wizard-orchestrator': 'false', + 'wizard-react-router': 'true', + }), + ).toEqual({ + 'wizard-orchestrator': 'true', + 'wizard-next-v2': 'legacy', + 'wizard-react-router': 'true', + }); + }); + + it('fails loudly on malformed JSON instead of testing live flags', () => { + process.env[ENV_KEY] = 'wizard-orchestrator=true'; + expect(() => applyCiFlagOverrides({})).toThrow(/not valid JSON/); + }); + }); + + describe('in production builds', () => { + it('is inert: overrides are ignored even when the env var is set', () => { + const prevNodeEnv = process.env.NODE_ENV; + process.env.NODE_ENV = 'production'; + process.env[ENV_KEY] = JSON.stringify({ 'wizard-orchestrator': true }); + let result: Record | 
undefined; + jest.isolateModules(() => { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const prod = require('@utils/ci-flag-overrides') as { + applyCiFlagOverrides: typeof applyCiFlagOverrides; + }; + result = prod.applyCiFlagOverrides({ 'wizard-orchestrator': 'false' }); + }); + process.env.NODE_ENV = prevNodeEnv; + expect(result).toEqual({ 'wizard-orchestrator': 'false' }); + }); + }); +}); diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index bf849a7e..d034f949 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -9,6 +9,7 @@ import type { ApiUser } from '@lib/api'; import { v4 as uuidv4 } from 'uuid'; import { IS_PRODUCTION_BUILD } from '@env'; import { debug, logToFile } from './debug'; +import { applyCiFlagOverrides } from './ci-flag-overrides'; /** * Extract a standard property bag from the current session. @@ -221,6 +222,7 @@ export class Analytics { if (this.activeFlags !== null) { return this.activeFlags; } + const out: Record = {}; try { const distinctId = this.distinctId ?? this.anonymousId; logToFile('[flags] evaluating as', { @@ -232,18 +234,23 @@ export class Analytics { personProperties: this.flagPersonProperties(), }); const flags = result.featureFlags ?? {}; - const out: Record = {}; for (const [key, value] of Object.entries(flags)) { if (value === undefined) continue; out[key] = typeof value === 'boolean' ? String(value) : String(value); } - this.activeFlags = out; - logToFile('[flags] evaluated', out); - return out; } catch (error) { debug('Failed to get all feature flags:', error); - return {}; + this.captureException( + error instanceof Error ? error : new Error(String(error)), + { step: 'get_all_flags' }, + ); } + // Outside the fetch guard on purpose: a malformed CI override must fail + // the run loudly, and a valid one applies even when the fetch failed — + // CI routing stays deterministic either way. 
+ this.activeFlags = applyCiFlagOverrides(out); + logToFile('[flags] evaluated', this.activeFlags); + return this.activeFlags; } async shutdown(status: 'success' | 'error' | 'cancelled') { diff --git a/src/utils/ci-flag-overrides.ts b/src/utils/ci-flag-overrides.ts new file mode 100644 index 00000000..e8790e23 --- /dev/null +++ b/src/utils/ci-flag-overrides.ts @@ -0,0 +1,46 @@ +/** + * CI-only feature-flag overrides. + * + * CI must route deterministically: a run that tests the orchestrator arm says + * so explicitly instead of depending on a live feature flag someone can edit + * mid-week. `WIZARD_CI_FLAG_OVERRIDES` is a JSON object of flag key → + * value, merged over whatever PostHog returned. + * + * The override path exists only in CI builds (`pnpm build:ci`). Published + * builds inline NODE_ENV as the literal "production", the guard below + * collapses, and tsdown strips the rest from the bundle — and the smoke test + * asserts the env var's name is physically absent from production output, so + * this can never quietly become a production surface. + */ +import { runtimeEnv } from '@env'; +import { logToFile } from './debug'; + +export function applyCiFlagOverrides( + flags: Record, +): Record { + // Compared inline (not via env.ts's IS_PRODUCTION_BUILD) so tsdown replaces + // it with a literal right here and the bundler can prove the rest of this + // function unreachable in production builds. The smoke test enforces that. + if (process.env.NODE_ENV === 'production') return flags; + + const raw = runtimeEnv('WIZARD_CI_FLAG_OVERRIDES'); + if (!raw) return flags; + + let overrides: Record; + try { + overrides = JSON.parse(raw) as Record; + } catch { + // A malformed override is a CI misconfiguration. Fail the run loudly + // rather than silently testing whatever the live flags happen to say. 
+ throw new Error( + 'WIZARD_CI_FLAG_OVERRIDES is not valid JSON (expected {"flag-key": value, ...}).', + ); + } + + const merged = { ...flags }; + for (const [key, value] of Object.entries(overrides)) { + merged[key] = String(value); + } + logToFile('[flags] CI overrides applied', overrides); + return merged; +} From 3b69e46c78d02a08a7f734ff134b424723980d31 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:22:53 -0400 Subject: [PATCH 08/12] feat(orchestrator): task instructions are ephemeral, not keepable skills (#637) Co-authored-by: Claude Opus 4.8 (1M context) --- .../__tests__/agent-prompt-loader.test.ts | 24 ++++++++++++++++++ .../orchestrator/agent-prompt-loader.ts | 13 ++++++++++ .../orchestrator/orchestrator-runner.ts | 25 ++++++++++++++++--- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts index 64a4bdab..8252e791 100644 --- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts @@ -3,11 +3,13 @@ import * as os from 'os'; import * as path from 'path'; import { agentRunTools, + assembleTaskPrompt, buildRegistry, parseAgentPrompt, resolveTask, type AgentPrompt, type AgentRegistry, + type OrchestratorPromptContext, } from '../agent-prompt-loader'; import { QueueStore } from '../queue'; @@ -203,3 +205,25 @@ describe('resolveTask', () => { ); }); }); + +describe('assembleTaskPrompt', () => { + const ctx: OrchestratorPromptContext = { + projectId: 1, + projectApiKey: 'phc_x', + host: 'https://us.posthog.com', + }; + + it('points the agent at its installed task instructions', () => { + const assembled = assembleTaskPrompt(ctx, 'do the task', [ + '.posthog-wizard/skills/capture/SKILL.md', + ]); + 
expect(assembled).toContain('.posthog-wizard/skills/capture/SKILL.md'); + expect(assembled).toContain('do the task'); + }); + + it('omits the instructions section when no skills are installed', () => { + expect(assembleTaskPrompt(ctx, 'do the task')).not.toContain( + 'task instructions', + ); + }); +}); diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts index ee351db8..902adaee 100644 --- a/src/lib/programs/orchestrator/agent-prompt-loader.ts +++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts @@ -49,6 +49,17 @@ function exampleReference(ctx: OrchestratorPromptContext): string | null { return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`; } +/** + * Points the agent at its installed task instructions (the HOW). They live under + * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load + * them — the prompt has to name them. + */ +function skillReference(paths: readonly string[]): string | null { + if (paths.length === 0) return null; + const list = paths.map((p) => `\`${p}\``).join(', '); + return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`; +} + /** The framework's rules ship with the reference skill; every task follows them. */ function commandmentsReference(ctx: OrchestratorPromptContext): string | null { if (!ctx.commandmentsPath) return null; @@ -63,11 +74,13 @@ const SEED_BASICS = `You are the orchestrator. 
Plan the work and seed the queue export function assembleTaskPrompt( ctx: OrchestratorPromptContext, body: string, + skillPaths: readonly string[] = [], ): string { return [ projectContext(ctx), exampleReference(ctx), commandmentsReference(ctx), + skillReference(skillPaths), TASK_BASICS, body, ] diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index 978a8f31..8f2aac90 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -11,7 +11,7 @@ * stays product-ignorant: it is the queue, the executor, and the loader. */ import { randomUUID } from 'crypto'; -import { existsSync } from 'fs'; +import { existsSync, rmSync } from 'fs'; import * as path from 'path'; import { initializeAgent, @@ -210,18 +210,27 @@ export async function runOrchestrator( // parallel, the seed's graph being the only schedule. Each task resolves to // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then // runs on its own model and tools. + const taskSkillsRoot = path.join(QUEUE_DIR_NAME, 'skills'); const runTask: RunTask = async (task) => { renderQueue(); try { const resolved = resolveTask(registry, task, store); const agent = await initializeAgent(agentConfigFor(task.id), options); + // Task instructions are one-run scaffolding, not durable skills, so they + // install under the run dir rather than .claude/skills — the SDK must not + // auto-load them and they must never land in the project (or a CI PR). + // The prompt points the agent at them instead. 
+ const skillPaths: string[] = []; for (const skillId of resolved.skills) { const result = await installSkillById( skillId, session.installDir, boot.skillsBaseUrl, + taskSkillsRoot, ); - if (result.kind !== 'ok') { + if (result.kind === 'ok') { + skillPaths.push(path.join(result.path, 'SKILL.md')); + } else { logToFile( `[orchestrator] skill install failed type=${task.type} skill=${skillId} ${result.kind}`, ); @@ -234,7 +243,7 @@ export async function runOrchestrator( allowedTools: resolved.allowedTools, disallowedTools: resolved.disallowedTools, }, - assembleTaskPrompt(promptContext, resolved.prompt), + assembleTaskPrompt(promptContext, resolved.prompt, skillPaths), options, spinner, // Empty messages suppress the per-task spinner lines (the spinner renders @@ -252,7 +261,15 @@ export async function runOrchestrator( renderQueue(); } }; - await drainQueue(store, runTask); + try { + await drainQueue(store, runTask); + } finally { + // Success or failure, the installed task instructions never outlive the run. 
+ rmSync(path.join(session.installDir, taskSkillsRoot), { + recursive: true, + force: true, + }); + } renderQueue(); From 80d6250b3a653374e94948481e7b708b36318fbe Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:23:03 -0400 Subject: [PATCH 09/12] =?UTF-8?q?feat(orchestrator):=20run=20telemetry=20?= =?UTF-8?q?=E2=80=94=20the=20responsiveness=20A/B=20spine=20(#638)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Claude Opus 4.8 (1M context) --- src/lib/agent/agent-interface.ts | 25 +++++ src/lib/agent/agent-runner.ts | 3 + .../__tests__/agent-prompt-loader.test.ts | 18 ++++ .../orchestrator/__tests__/queue.test.ts | 40 ++++++++ .../orchestrator/agent-prompt-loader.ts | 27 ++--- .../orchestrator/orchestrator-runner.ts | 99 ++++++++++++++++++- src/lib/programs/orchestrator/queue.ts | 49 ++++++++- 7 files changed, 245 insertions(+), 16 deletions(-) diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts index 07342385..f80e7857 100644 --- a/src/lib/agent/agent-interface.ts +++ b/src/lib/agent/agent-interface.ts @@ -678,6 +678,11 @@ export async function runAgent( abortCases?: readonly AbortCaseMatcher[]; /** Request the end-of-run reflection remark. Defaults to true. */ requestRemark?: boolean; + /** + * Extra properties attached to this run's `agent completed` / `agent + * aborted` events (e.g. the orchestrator's task type and id). + */ + analyticsProperties?: Record; }, middleware?: { onMessage(message: any): void; @@ -756,9 +761,27 @@ export async function runAgent( analytics.capture(WIZARD_REMARK_EVENT_NAME, { remark }); } + // Token usage comes from the SDK result message and is per agent run — + // for the orchestrator that means per task, the secondary cost to watch. 
+ const usage = lastResultMessage?.usage as + | { + input_tokens?: number; + output_tokens?: number; + cache_creation_input_tokens?: number; + cache_read_input_tokens?: number; + } + | undefined; analytics.wizardCapture('agent completed', { duration_ms: durationMs, duration_seconds: durationSeconds, + model: agentConfig.model, + num_turns: lastResultMessage?.num_turns, + total_cost_usd: lastResultMessage?.total_cost_usd, + input_tokens: usage?.input_tokens, + output_tokens: usage?.output_tokens, + cache_creation_input_tokens: usage?.cache_creation_input_tokens, + cache_read_input_tokens: usage?.cache_read_input_tokens, + ...config?.analyticsProperties, }); try { middleware?.finalize(lastResultMessage, durationMs); @@ -1172,6 +1195,8 @@ export async function runAgent( analytics.wizardCapture('agent aborted', { duration_ms: durationMs, duration_seconds: Math.round(durationMs / 1000), + model: agentConfig.model, + ...config?.analyticsProperties, }); } } diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index a08320ed..2d693058 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -368,6 +368,9 @@ async function bootstrapProgram( // fork decision reads the flags. const wizardFlags = await analytics.getAllFlagsForWizard(); const wizardMetadata = buildWizardMetadata(wizardFlags); + // Tag every wizard event with the variant so runs segment in PostHog; the + // orchestrator arm overwrites this with its own variant when it forks. + analytics.setTag('variant', wizardMetadata.VARIANT); const mcpUrl = session.localMcp ? 
'http://localhost:8787/mcp' diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts index 8252e791..22ce11af 100644 --- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts @@ -7,6 +7,7 @@ import { buildRegistry, parseAgentPrompt, resolveTask, + taskModel, type AgentPrompt, type AgentRegistry, type OrchestratorPromptContext, @@ -206,6 +207,23 @@ describe('resolveTask', () => { }); }); +describe('taskModel', () => { + const prompt = parseAgentPrompt( + '---\nmodel: prompt-model\n---\nx', + 'capture', + ); + + it('prefers the enqueue override, then the prompt, then the default', () => { + const registry = registryOf([prompt]); + const task = { type: 'capture' }; + expect(taskModel(registry, { ...task, model: 'override' } as never)).toBe( + 'override', + ); + expect(taskModel(registry, task as never)).toBe('prompt-model'); + expect(taskModel(registryOf([]), task as never)).toBe('claude-sonnet-4-6'); + }); +}); + describe('assembleTaskPrompt', () => { const ctx: OrchestratorPromptContext = { projectId: 1, diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts index 4a18dee2..7f34f283 100644 --- a/src/lib/programs/orchestrator/__tests__/queue.test.ts +++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts @@ -7,6 +7,10 @@ import { type TaskHandoff, } from '@lib/programs/orchestrator/queue'; +jest.mock('@utils/analytics', () => ({ + analytics: { captureException: jest.fn(), wizardCapture: jest.fn() }, +})); + function tmpDir(): string { return fs.mkdtempSync(path.join(os.tmpdir(), 'queue-test-')); } @@ -132,4 +136,40 @@ describe('QueueStore', () => { expect(file.tasks[0].status).toBe('done'); expect(file.tasks[0].handoff?.did).toBe('d'); }); + + it('notifies the transition listener with post-transition task state', () 
=> { + const seen: Array<{ event: string; status: string; attempts: number }> = []; + const listened = new QueueStore(dir, 'run-2', { + onTransition: (event, task) => + seen.push({ event, status: task.status, attempts: task.attempts }), + }); + + const t = listened.enqueue({ type: 'install' }); + listened.start(t.id); + listened.fail(t.id, { type: 'API_ERROR', message: 'boom' }); + listened.requeue(t.id); + listened.start(t.id); + listened.complete(t.id); + + expect(seen).toEqual([ + { event: 'enqueue', status: 'pending', attempts: 0 }, + { event: 'start', status: 'running', attempts: 1 }, + { event: 'fail', status: 'failed', attempts: 1 }, + { event: 'requeue', status: 'pending', attempts: 1 }, + { event: 'start', status: 'running', attempts: 2 }, + { event: 'complete', status: 'done', attempts: 2 }, + ]); + }); + + it('a throwing listener does not break transitions', () => { + const listened = new QueueStore(dir, 'run-3', { + onTransition: () => { + throw new Error('listener boom'); + }, + }); + const t = listened.enqueue({ type: 'install' }); + listened.start(t.id); + listened.complete(t.id); + expect(listened.get(t.id)?.status).toBe('done'); + }); }); diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts index 902adaee..3212a2c3 100644 --- a/src/lib/programs/orchestrator/agent-prompt-loader.ts +++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts @@ -49,6 +49,16 @@ function exampleReference(ctx: OrchestratorPromptContext): string | null { return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`; } +/** The framework's rules ship with the reference skill; every task follows them. 
*/ +function commandmentsReference(ctx: OrchestratorPromptContext): string | null { + if (!ctx.commandmentsPath) return null; + return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; +} + +const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; + +const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; + /** * Points the agent at its installed task instructions (the HOW). They live under * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load @@ -60,16 +70,6 @@ function skillReference(paths: readonly string[]): string | null { return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`; } -/** The framework's rules ship with the reference skill; every task follows them. 
*/ -function commandmentsReference(ctx: OrchestratorPromptContext): string | null { - if (!ctx.commandmentsPath) return null; - return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; -} - -const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; - -const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; - /** A task agent's full prompt: injected basics, then the authored intent. */ export function assembleTaskPrompt( ctx: OrchestratorPromptContext, @@ -315,9 +315,14 @@ export function resolveTask( .join('\n\n'); return { - model: task.model ?? prompt.model ?? DEFAULT_TASK_MODEL, + model: taskModel(registry, task), ...agentRunTools(prompt), prompt: body, skills: prompt.skills, }; } + +/** The model a task runs on: enqueue override, then prompt frontmatter, then default. 
*/ +export function taskModel(registry: AgentRegistry, task: QueuedTask): string { + return task.model ?? registry.get(task.type)?.model ?? DEFAULT_TASK_MODEL; +} diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index 8f2aac90..b6129016 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -27,7 +27,12 @@ import { logToFile } from '../../../utils/debug'; import type { ProgramConfig } from '../program-step'; import type { BootstrapResult } from '../../agent/agent-runner'; import type { WizardRunOptions } from '../../../utils/types'; -import { QueueStore, QUEUE_DIR_NAME, TaskStatus } from './queue'; +import { + QueueStore, + QUEUE_DIR_NAME, + TaskStatus, + type QueuedTask, +} from './queue'; import { drainQueue, type RunTask } from './executor'; import { agentRunTools, @@ -35,6 +40,7 @@ import { assembleTaskPrompt, loadAgentRegistry, resolveTask, + taskModel, type OrchestratorPromptContext, } from './agent-prompt-loader'; @@ -73,7 +79,6 @@ export async function runOrchestrator( boot: BootstrapResult, ): Promise { const runId = randomUUID(); - const store = new QueueStore(session.installDir, runId); const options = sessionRunOptions(session); @@ -91,6 +96,74 @@ export async function runOrchestrator( ); } + // Every wizard event from here on carries the variant, so orchestrator runs + // segment cleanly from the linear baseline. + analytics.setTag('variant', 'orchestrator'); + + // Responsiveness is the headline metric of the dark launch: time to first + // visible progress, and no single step dominating wall-clock. Track it from + // queue transitions, with the resolved model so cheap work is attributable + // to cheap models. + const runStartMs = Date.now(); + let firstStartMs: number | undefined; + let lastStartMs: number | undefined; + const durationMs = (t: QueuedTask) => + t.startedAt && t.finishedAt + ? 
Date.parse(t.finishedAt) - Date.parse(t.startedAt) + : undefined; + + const store = new QueueStore(session.installDir, runId, { + onTransition: (event, task) => { + const base = { + type: task.type, + model: taskModel(registry, task), + attempts: task.attempts, + }; + switch (event) { + case 'enqueue': + analytics.wizardCapture('orchestrator task enqueued', { + type: task.type, + enqueued_by: task.enqueuedBy, + dynamic: task.enqueuedBy !== 'orchestrator', + }); + break; + case 'start': { + const now = Date.now(); + analytics.wizardCapture('orchestrator task started', { + ...base, + ms_since_run_start: now - runStartMs, + gap_since_prev_start_ms: + lastStartMs === undefined ? undefined : now - lastStartMs, + }); + firstStartMs ??= now; + lastStartMs = now; + break; + } + case 'complete': + analytics.wizardCapture('orchestrator task completed', { + ...base, + duration_ms: durationMs(task), + }); + break; + case 'skip': + analytics.wizardCapture('orchestrator task skipped', { + ...base, + duration_ms: durationMs(task), + }); + break; + case 'fail': + analytics.wizardCapture('orchestrator task failed', { + ...base, + duration_ms: durationMs(task), + error: task.error?.type, + }); + break; + case 'requeue': + break; + } + }, + }); + // Give task agents the framework's finished reference integration to match, // the same EXAMPLE.md the linear flow uses. Install it under the run dir rather // than .claude/skills so its "do everything" workflow is not auto-loaded as a @@ -191,6 +264,7 @@ export async function runOrchestrator( successMessage: 'Planned the integration', additionalFeatureQueue: [], requestRemark: false, + analyticsProperties: { task_type: 'seed' }, }, ); if (seedResult.error) { @@ -211,6 +285,7 @@ export async function runOrchestrator( // its agent prompt (the WHAT) and the mini-skills it needs (the HOW), then // runs on its own model and tools. 
const taskSkillsRoot = path.join(QUEUE_DIR_NAME, 'skills'); + let remarkRequested = false; const runTask: RunTask = async (task) => { renderQueue(); try { @@ -236,6 +311,18 @@ export async function runOrchestrator( ); } } + // The run-end reflection fires once, on the task that is last in the + // queue when it starts — nothing else pending or running alongside it. + const isLastTask = !store + .list() + .some( + (t) => + t.id !== task.id && + (t.status === TaskStatus.Pending || + t.status === TaskStatus.Running), + ); + const requestRemark = isLastTask && !remarkRequested; + if (requestRemark) remarkRequested = true; await runAgent( { ...agent, @@ -249,12 +336,12 @@ export async function runOrchestrator( // Empty messages suppress the per-task spinner lines (the spinner renders // only when a message is set); the queue panel shows progress. Errors // still surface — runAgent stops the spinner with its own error text. - // No per-task remark — the reflection would fire on every task. { spinnerMessage: '', successMessage: '', additionalFeatureQueue: [], - requestRemark: false, + requestRemark, + analyticsProperties: { task_type: task.type, task_id: task.id }, }, ); } finally { @@ -281,6 +368,10 @@ export async function runOrchestrator( tasks_total: summary.total, tasks_done: summary.done, tasks_failed: summary.failed, + tasks_skipped: summary.skipped, + total_duration_ms: Date.now() - runStartMs, + time_to_first_task_ms: + firstStartMs === undefined ? 
undefined : firstStartMs - runStartMs, }); // The build step flags any unresolved conflict in its handoff; surface the diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 4ecc3cb5..302897e6 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -14,6 +14,7 @@ import * as fs from 'fs'; import * as path from 'path'; import { randomUUID } from 'crypto'; import { writeJsonAtomic } from '../../../utils/atomic-ledger'; +import { analytics } from '../../../utils/analytics'; export const TaskStatus = { Pending: 'pending', @@ -75,17 +76,40 @@ export interface EnqueueInput { export const QUEUE_DIR_NAME = '.posthog-wizard'; const DEFAULT_MAX_ATTEMPTS = 2; +/** Every queue transition, in the order it is reflected. */ +export type TransitionEvent = + | 'enqueue' + | 'start' + | 'complete' + | 'skip' + | 'fail' + | 'requeue'; + +export interface QueueStoreOptions { + /** + * Called on every transition with the task's post-transition state. The + * runner uses it for telemetry; the store itself stays analytics-free. + * Listener errors are reported but cannot break a transition. 
+ */ + onTransition?: (event: TransitionEvent, task: QueuedTask) => void; +} + function nowIso(): string { return new Date().toISOString(); } export class QueueStore { private tasks: QueuedTask[] = []; + private readonly onTransition?: ( + event: TransitionEvent, + task: QueuedTask, + ) => void; readonly runId: string; readonly queuePath: string; - constructor(installDir: string, runId: string) { + constructor(installDir: string, runId: string, opts?: QueueStoreOptions) { + this.onTransition = opts?.onTransition; this.runId = runId; const dir = path.join(installDir, QUEUE_DIR_NAME); this.queuePath = path.join(dir, 'queue.json'); @@ -172,6 +196,7 @@ export class QueueStore { }; this.tasks.push(task); this.reflect(); + this.notify('enqueue', task); return task; } @@ -181,6 +206,7 @@ export class QueueStore { t.startedAt = nowIso(); t.attempts += 1; this.reflect(); + this.notify('start', t); return t; } @@ -210,6 +236,7 @@ export class QueueStore { t.startedAt = undefined; t.finishedAt = undefined; this.reflect(); + this.notify('requeue', t); return t; } @@ -225,6 +252,14 @@ export class QueueStore { t.status = status; t.finishedAt = nowIso(); this.reflect(); + this.notify( + status === TaskStatus.Done + ? 'complete' + : status === TaskStatus.Skipped + ? 'skip' + : 'fail', + t, + ); return t; } @@ -237,6 +272,18 @@ export class QueueStore { writeJsonAtomic(this.queuePath, file); } + private notify(event: TransitionEvent, task: QueuedTask): void { + try { + this.onTransition?.(event, task); + } catch (error) { + // A listener must never break a transition, but its failure is a bug. + analytics.captureException( + error instanceof Error ? 
error : new Error(String(error)), + { step: 'orchestrator_queue_listener', event }, + ); + } + } + private require(id: string): QueuedTask { const t = this.get(id); if (!t) throw new Error(`No task ${id} in the queue`); From 1d8ef519a8ebef70a93063b8ae76750d8af890a0 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:23:18 -0400 Subject: [PATCH 10/12] feat(orchestrator): ci-excluded task types (#639) Co-authored-by: Claude Opus 4.8 (1M context) --- scripts/smoke-test.sh | 26 +++++++------ src/env.ts | 1 + .../__tests__/agent-prompt-loader.test.ts | 14 ++++++- .../agent-prompt-loader.ts | 16 ++++++-- src/lib/agent/agent-runner.ts | 8 ++-- src/lib/agent/mcp-prompt-streaming.ts | 39 +++---------------- .../__tests__/queue-tools.test.ts | 12 ++++++ src/lib/programs/orchestrator/executor.ts | 2 +- .../orchestrator/orchestrator-runner.ts | 4 +- src/lib/programs/orchestrator/queue-tools.ts | 20 +++++++++- src/lib/programs/orchestrator/queue.ts | 18 ++++++--- src/utils/__tests__/ci-flag-overrides.test.ts | 36 ++++++++++++++++- src/utils/ci-flag-overrides.ts | 35 +++++++++++++---- 13 files changed, 160 insertions(+), 71 deletions(-) rename src/lib/{programs/orchestrator => agent}/__tests__/agent-prompt-loader.test.ts (93%) rename src/lib/{programs/orchestrator => agent}/agent-prompt-loader.ts (95%) diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh index 1e9a690b..fcdab7f3 100755 --- a/scripts/smoke-test.sh +++ b/scripts/smoke-test.sh @@ -24,14 +24,16 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH # builds and tsdown strips it; its env var name appearing in dist/*.js means # dead-code elimination regressed and a prod surface leaked. Sourcemaps keep # the original source, so only .js output counts. 
-OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES' +OVERRIDE_MARKERS='WIZARD_CI_FLAG_OVERRIDES WIZARD_CI_EXCLUDE_TASKS' if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then - # CI builds must keep the path — its absence means the override silently - # stopped working and CI is back to testing live flags. - if ! grep -q "$OVERRIDE_MARKER" ./dist/*.js; then - echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2 - exit 1 - fi + # CI builds must keep the paths — their absence means the overrides silently + # stopped working and CI is back to testing live behavior. + for marker in $OVERRIDE_MARKERS; do + if ! grep -q "$marker" ./dist/*.js; then + echo "Smoke test failed: CI build is missing the $marker path" >&2 + exit 1 + fi + done # And a real invocation must accept the env var. yargs claims every # POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects # unknown ones during command parse (--version/--help short-circuit and @@ -44,10 +46,12 @@ if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then exit 1 fi else - if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then - echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2 - exit 1 - fi + for marker in $OVERRIDE_MARKERS; do + if grep -q "$marker" ./dist/*.js; then + echo "Smoke test failed: $marker code leaked into a production build" >&2 + exit 1 + fi + done fi # ── 3. --ci rejected in production builds ──────────────────────────────────── diff --git a/src/env.ts b/src/env.ts index c32e886a..4b727441 100644 --- a/src/env.ts +++ b/src/env.ts @@ -43,6 +43,7 @@ type RuntimeEnvKey = // Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD') // would claim it as an unknown CLI option and strict-reject the run. 
| 'WIZARD_CI_FLAG_OVERRIDES' + | 'WIZARD_CI_EXCLUDE_TASKS' // Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix) | 'POSTHOG_WIZARD_BENCHMARK_CONFIG' | 'POSTHOG_WIZARD_BENCHMARK_FILE' diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts similarity index 93% rename from src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts rename to src/lib/agent/__tests__/agent-prompt-loader.test.ts index 22ce11af..a0a2b04a 100644 --- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts @@ -12,7 +12,7 @@ import { type AgentRegistry, type OrchestratorPromptContext, } from '../agent-prompt-loader'; -import { QueueStore } from '../queue'; +import { QueueStore } from '../../programs/orchestrator/queue'; function tmpDir(): string { return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-')); @@ -131,6 +131,18 @@ describe('buildRegistry', () => { // A flowless prompt (e.g. the documentation example) joins no registry. 
expect(registry.get('example')).toBeUndefined(); }); + + it('drops harness-excluded types; unrestricted runs keep them', () => { + const prompts = [ + prompt({ type: 'plan', flow: 'f', seed: true }), + prompt({ type: 'build', flow: 'f' }), + prompt({ type: 'dashboard', flow: 'f' }), + ]; + expect( + buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types, + ).toEqual(['build']); + expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']); + }); }); describe('resolveTask', () => { diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts similarity index 95% rename from src/lib/programs/orchestrator/agent-prompt-loader.ts rename to src/lib/agent/agent-prompt-loader.ts index 3212a2c3..1fe487b1 100644 --- a/src/lib/programs/orchestrator/agent-prompt-loader.ts +++ b/src/lib/agent/agent-prompt-loader.ts @@ -15,8 +15,8 @@ * network latency. The registry's type list also drives `enqueue_task` * validation. */ -import type { QueueStore, QueuedTask } from './queue'; -import type { ResolvedTask } from './executor'; +import type { QueueStore, QueuedTask } from '../programs/orchestrator/queue'; +import type { ResolvedTask } from '../programs/orchestrator/executor'; /** * The basics the client injects around every agent-prompt body. The `/agents/` @@ -137,8 +137,15 @@ export interface AgentRegistry { export function buildRegistry( prompts: readonly AgentPrompt[], flow: string, + opts?: { exclude?: readonly string[] }, ): AgentRegistry { - const inFlow = prompts.filter((p) => p.flow === flow); + // The harness can exclude task types (CI excludes dashboards). An excluded + // type does not exist for the run: the seed cannot enqueue it and no agent + // is ever spun up for it. + const excluded = new Set(opts?.exclude ?? 
[]); + const inFlow = prompts.filter( + (p) => p.flow === flow && !excluded.has(p.type), + ); const byType = new Map(inFlow.map((p) => [p.type, p])); return { types: inFlow.filter((p) => !p.seed).map((p) => p.type), @@ -238,6 +245,7 @@ async function fetchText(url: string): Promise { export async function loadAgentRegistry( skillsBaseUrl: string, flow: string, + opts?: { exclude?: readonly string[] }, ): Promise { const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`); const menu = JSON.parse(menuRaw) as AgentMenu; @@ -249,7 +257,7 @@ export async function loadAgentRegistry( }), ); - return buildRegistry(prompts, flow); + return buildRegistry(prompts, flow, opts); } /** diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 2d693058..0f3269c4 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -372,12 +372,12 @@ async function bootstrapProgram( // orchestrator arm overwrites this with its own variant when it forks. analytics.setTag('variant', wizardMetadata.VARIANT); + // One MCP url for every region: the server resolves the user's region from + // the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is + // not needed here. const mcpUrl = session.localMcp ? 'http://localhost:8787/mcp' - : runtimeEnv('MCP_URL') || - (cloudRegion === 'eu' - ? 'https://mcp-eu.posthog.com/mcp' - : 'https://mcp.posthog.com/mcp'); + : runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp'; return { skillsBaseUrl, diff --git a/src/lib/agent/mcp-prompt-streaming.ts b/src/lib/agent/mcp-prompt-streaming.ts index dc8f8ff9..b3655f12 100644 --- a/src/lib/agent/mcp-prompt-streaming.ts +++ b/src/lib/agent/mcp-prompt-streaming.ts @@ -42,38 +42,11 @@ const MODEL = 'claude-sonnet-4-6'; // telemetry on average turn counts per prompt. 
const MAX_TURNS = 30; -function resolveMcpUrl(host: string): string { - const override = runtimeEnv('MCP_URL'); - if (override) return override; - // Parse the actual hostname rather than substring-matching the raw - // input. `host.includes('eu.posthog.com')` would let arbitrary URLs - // like `https://evil.eu.posthog.com.attacker.com` or - // `https://useu.posthog.commerce` route to the EU MCP endpoint - // (CodeQL: incomplete-url-substring-sanitization). Parsing into a - // hostname and checking exact match / trusted subdomain blocks both. - const hostname = parseHostname(host); - const isEu = - hostname === 'eu.posthog.com' || hostname.endsWith('.eu.posthog.com'); - return isEu - ? 'https://mcp-eu.posthog.com/mcp' - : 'https://mcp.posthog.com/mcp'; -} - -/** - * Normalize a host string into a hostname suitable for trust checks. - * Accepts either a full URL (`https://us.posthog.com`) or a bare host - * (`us.posthog.com`). Returns the hostname lowercased, or the trimmed - * input lowercased if parsing fails (defensive fallback so a malformed - * value still resolves to the safer-default US endpoint). - */ -function parseHostname(raw: string): string { - const trimmed = raw.trim().toLowerCase(); - try { - const withScheme = trimmed.includes('://') ? trimmed : `https://${trimmed}`; - return new URL(withScheme).hostname.toLowerCase(); - } catch { - return trimmed; - } +// One MCP url for every region: the server resolves the user's region from +// the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is +// not needed here. +function resolveMcpUrl(): string { + return runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp'; } /** @@ -245,7 +218,7 @@ export async function* runMcpPromptViaSdk(args: { once: true, }); - const mcpUrl = resolveMcpUrl(credentials.host); + const mcpUrl = resolveMcpUrl(); logToFile( `[runMcpPromptViaSdk] mcpUrl=${mcpUrl} model=${MODEL} resume=${ resumeSessionId ?? 
'(none)' diff --git a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts index 318825d2..33def856 100644 --- a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts +++ b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts @@ -57,6 +57,18 @@ describe('checkEnqueueGuards', () => { const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' }); expect(r).toEqual({ ok: true }); }); + + it('refuses to grow the queue past the runaway cap', () => { + for (let i = 0; i < 30; i++) { + store.enqueue({ type: 'capture', inputs: { i } }); + } + const r = checkEnqueueGuards(ctx, { + type: 'init', + inputs: { i: 30 }, + reason: 'x', + }); + expect(r).toMatchObject({ ok: false, guard: 'queue-full' }); + }); }); describe('apply functions', () => { diff --git a/src/lib/programs/orchestrator/executor.ts b/src/lib/programs/orchestrator/executor.ts index abf0ed15..d8cfb976 100644 --- a/src/lib/programs/orchestrator/executor.ts +++ b/src/lib/programs/orchestrator/executor.ts @@ -101,7 +101,7 @@ export async function drainQueue( for (;;) { for (const task of store.nextRunnable()) { if (++starts > opts.maxStarts) break; - // runOne marks the task in_progress synchronously, so the next + // runOne marks the task running synchronously, so the next // nextRunnable() call no longer offers it. 
const p = runOne(store, runTask, task).finally(() => running.delete(task.id), diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index b6129016..31df7372 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -23,6 +23,7 @@ import { detectNodePackageManagers } from '../../detection/package-manager'; import { installSkillById } from '../../wizard-tools'; import { getUI } from '../../../ui'; import { analytics } from '../../../utils/analytics'; +import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides'; import { logToFile } from '../../../utils/debug'; import type { ProgramConfig } from '../program-step'; import type { BootstrapResult } from '../../agent/agent-runner'; @@ -42,7 +43,7 @@ import { resolveTask, taskModel, type OrchestratorPromptContext, -} from './agent-prompt-loader'; +} from '../../agent/agent-prompt-loader'; function toTodoStatus(status: TaskStatus): string { switch (status) { @@ -88,6 +89,7 @@ export async function runOrchestrator( const registry = await loadAgentRegistry( boot.skillsBaseUrl, programConfig.id, + { exclude: ciExcludedTaskTypes() }, ); const seedPrompt = registry.seed; if (!seedPrompt) { diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts index 64e5bc93..5a05cda0 100644 --- a/src/lib/programs/orchestrator/queue-tools.ts +++ b/src/lib/programs/orchestrator/queue-tools.ts @@ -55,10 +55,18 @@ function dedupKey(type: string, inputs: Record): string { return `${type}::${stableStringify(inputs)}`; } +/** + * A backstop on total queue size. Tasks can enqueue tasks, so a misbehaving + * type could grow the queue without bound. Keeping the graph small is the job + * of good agent and skill design, not this number — it only stops a runaway. + * The real flow is ~9 tasks, so this sits well clear of it. 
+ */ +const MAX_QUEUE_TASKS = 30; + /** * Validate an enqueue. Structural checks only — a real type, real dependencies, - * and not a literal duplicate. How much runs, and in what shape, is the task - * graph's business, not a knob's. + * not a literal duplicate, and not past the runaway backstop. How much runs, + * and in what shape, is the task graph's business, not a knob's. */ export function checkEnqueueGuards( ctx: OrchestratorToolsContext, @@ -66,6 +74,14 @@ export function checkEnqueueGuards( ): GuardResult { const tasks = ctx.store.list(); + if (tasks.length >= MAX_QUEUE_TASKS) { + return { + ok: false, + guard: 'queue-full', + message: `The queue already holds ${tasks.length} tasks (cap ${MAX_QUEUE_TASKS}). Refine the existing tasks rather than adding more.`, + }; + } + if (!ctx.validTypes.includes(args.type)) { return { ok: false, diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 302897e6..19545d9d 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -32,6 +32,12 @@ export interface QueuedTask { /** Human-readable label for the TUI, set by the enqueuing agent. */ label?: string; status: TaskStatus; + /** + * Ids of tasks that must finish before this one runs. Ids are generated at + * enqueue and dependsOn is never mutated, so a task can only depend on tasks + * created before it — the graph is a DAG by construction, cycles cannot + * form. Unknown ids are rejected by the enqueue_task guard. + */ dependsOn: string[]; inputs: Record; model?: string; @@ -76,6 +82,10 @@ export interface EnqueueInput { export const QUEUE_DIR_NAME = '.posthog-wizard'; const DEFAULT_MAX_ATTEMPTS = 2; +function nowIso(): string { + return new Date().toISOString(); +} + /** Every queue transition, in the order it is reflected. 
*/ export type TransitionEvent = | 'enqueue' @@ -94,10 +104,6 @@ export interface QueueStoreOptions { onTransition?: (event: TransitionEvent, task: QueuedTask) => void; } -function nowIso(): string { - return new Date().toISOString(); -} - export class QueueStore { private tasks: QueuedTask[] = []; private readonly onTransition?: ( @@ -147,7 +153,7 @@ export class QueueStore { } /** - * True when no task is in progress and none can be started. Either everything + * True when no task is running and none can be started. Either everything * is terminal, or the only pending tasks are blocked by a failed dependency. */ isDrained(): boolean { @@ -229,7 +235,7 @@ export class QueueStore { return this.finish(id, TaskStatus.Failed, handoff); } - /** Put a failed/in-progress task back to pending for a retry within the run. */ + /** Put a failed/running task back to pending for a retry within the run. */ requeue(id: string): QueuedTask { const t = this.require(id); t.status = TaskStatus.Pending; diff --git a/src/utils/__tests__/ci-flag-overrides.test.ts b/src/utils/__tests__/ci-flag-overrides.test.ts index 4d2333a1..4f0d844f 100644 --- a/src/utils/__tests__/ci-flag-overrides.test.ts +++ b/src/utils/__tests__/ci-flag-overrides.test.ts @@ -1,4 +1,7 @@ -import { applyCiFlagOverrides } from '@utils/ci-flag-overrides'; +import { + applyCiFlagOverrides, + ciExcludedTaskTypes, +} from '@utils/ci-flag-overrides'; jest.mock('@utils/debug', () => ({ logToFile: jest.fn(), @@ -61,3 +64,34 @@ describe('applyCiFlagOverrides', () => { }); }); }); + +describe('ciExcludedTaskTypes', () => { + afterEach(() => { + delete process.env.WIZARD_CI_EXCLUDE_TASKS; + }); + + it('is empty when nothing is excluded', () => { + expect(ciExcludedTaskTypes()).toEqual([]); + }); + + it('parses the comma-separated list, ignoring stray whitespace', () => { + process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard, report ,'; + expect(ciExcludedTaskTypes()).toEqual(['dashboard', 'report']); + }); + + it('is inert in 
production builds', () => { + const prevNodeEnv = process.env.NODE_ENV; + process.env.NODE_ENV = 'production'; + process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard'; + let result: readonly string[] | undefined; + jest.isolateModules(() => { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const prod = require('@utils/ci-flag-overrides') as { + ciExcludedTaskTypes: typeof ciExcludedTaskTypes; + }; + result = prod.ciExcludedTaskTypes(); + }); + process.env.NODE_ENV = prevNodeEnv; + expect(result).toEqual([]); + }); +}); diff --git a/src/utils/ci-flag-overrides.ts b/src/utils/ci-flag-overrides.ts index e8790e23..475060c3 100644 --- a/src/utils/ci-flag-overrides.ts +++ b/src/utils/ci-flag-overrides.ts @@ -3,14 +3,15 @@ * * CI must route deterministically: a run that tests the orchestrator arm says * so explicitly instead of depending on a live feature flag someone can edit - * mid-week. `WIZARD_CI_FLAG_OVERRIDES` is a JSON object of flag key → - * value, merged over whatever PostHog returned. + * mid-week. The override env var (see the allowlist in `env.ts`) is a JSON + * object of flag key → value, merged over whatever PostHog returned. * * The override path exists only in CI builds (`pnpm build:ci`). Published - * builds inline NODE_ENV as the literal "production", the guard below - * collapses, and tsdown strips the rest from the bundle — and the smoke test - * asserts the env var's name is physically absent from production output, so - * this can never quietly become a production surface. + * builds inline NODE_ENV as the literal "production", the guards collapse, + * and tsdown strips the rest from the bundle — and the smoke test asserts the + * env var names are physically absent from production output (which is also + * why no comment in this file may spell them out), so this can never quietly + * become a production surface. 
*/ import { runtimeEnv } from '@env'; import { logToFile } from './debug'; @@ -33,7 +34,7 @@ export function applyCiFlagOverrides( // A malformed override is a CI misconfiguration. Fail the run loudly // rather than silently testing whatever the live flags happen to say. throw new Error( - 'WIZARD_CI_FLAG_OVERRIDES is not valid JSON (expected {"flag-key": value, ...}).', + 'The CI flag-override env var is not valid JSON (expected {"flag-key": value, ...}).', ); } @@ -44,3 +45,23 @@ export function applyCiFlagOverrides( logToFile('[flags] CI overrides applied', overrides); return merged; } + +/** + * Task types excluded from this run. The exclusion env var (see the allowlist + * in `env.ts`) is a comma-separated list (e.g. `dashboard`), set by the CI + * harness that owns the policy — the wizard and the served content stay + * run-mode agnostic. CI-build only, same as the flag overrides: published + * builds strip this path. + */ +export function ciExcludedTaskTypes(): readonly string[] { + if (process.env.NODE_ENV === 'production') return []; + + const raw = runtimeEnv('WIZARD_CI_EXCLUDE_TASKS'); + if (!raw) return []; + const types = raw + .split(',') + .map((t) => t.trim()) + .filter(Boolean); + if (types.length > 0) logToFile('[flags] CI task exclusions', types); + return types; +} From d7077e161d3b40415d9caf67f2915987e219b15a Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:23:28 -0400 Subject: [PATCH 11/12] feat(orchestrator): self-deleting run cache + responsiveness analytics (#677) Co-authored-by: Claude Fable 5 --- .../__tests__/agent-prompt-loader.test.ts | 40 +++++++++ src/lib/agent/agent-prompt-loader.ts | 35 ++++++-- .../orchestrator/__tests__/queue.test.ts | 10 +++ .../__tests__/run-metrics.test.ts | 68 +++++++++++++++ .../orchestrator/orchestrator-runner.ts | 81 +++++++++++++----- src/lib/programs/orchestrator/queue.ts | 19 ++++- src/lib/programs/orchestrator/run-metrics.ts | 
85 +++++++++++++++++++ src/ui/tui/primitives/LogViewer.tsx | 6 +- 8 files changed, 311 insertions(+), 33 deletions(-) create mode 100644 src/lib/programs/orchestrator/__tests__/run-metrics.test.ts create mode 100644 src/lib/programs/orchestrator/run-metrics.ts diff --git a/src/lib/agent/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts index a0a2b04a..d2ad5b7a 100644 --- a/src/lib/agent/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts @@ -217,6 +217,46 @@ describe('resolveTask', () => { 'Context from previous steps', ); }); + + it('includes transitive ancestors, not just direct dependencies', () => { + const registry = registryOf([prompt]); + // install -> capture -> (this task). The task depends only on capture, but + // install's context must still reach it so nothing is silently lost. + const install = store.enqueue({ type: 'install' }); + store.complete(install.id, { + goals: 'declare the SDK', + did: 'added posthog to the manifest', + forNextAgent: 'SDK is declared, not yet installed', + }); + const capture = store.enqueue({ type: 'capture', dependsOn: [install.id] }); + store.complete(capture.id, { + goals: 'instrument events', + did: 'added capture calls', + forNextAgent: 'events are in', + }); + const task = store.enqueue({ type: 'capture', dependsOn: [capture.id] }); + const { prompt: out } = resolveTask(registry, task, store); + expect(out).toContain('added posthog to the manifest'); // transitive + expect(out).toContain('added capture calls'); // direct + }); + + it('lists each ancestor once for diamond dependencies', () => { + const registry = registryOf([prompt]); + const install = store.enqueue({ type: 'install' }); + store.complete(install.id, { + goals: 'g', + did: 'manifest entry added', + forNextAgent: 'n', + }); + const a = store.enqueue({ type: 'identify', dependsOn: [install.id] }); + store.complete(a.id, { goals: 'g', did: 'a-did', forNextAgent: 'n' }); + 
const b = store.enqueue({ type: 'identify', dependsOn: [install.id] }); + store.complete(b.id, { goals: 'g', did: 'b-did', forNextAgent: 'n' }); + // Resolved task must be a registered type (capture); its ancestors need not be. + const task = store.enqueue({ type: 'capture', dependsOn: [a.id, b.id] }); + const { prompt: out } = resolveTask(registry, task, store); + expect(out.match(/manifest entry added/g)).toHaveLength(1); + }); }); describe('taskModel', () => { diff --git a/src/lib/agent/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts index 1fe487b1..8b83ff9a 100644 --- a/src/lib/agent/agent-prompt-loader.ts +++ b/src/lib/agent/agent-prompt-loader.ts @@ -55,7 +55,7 @@ function commandmentsReference(ctx: OrchestratorPromptContext): string | null { return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; } -const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; +const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. 
Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only inside this project's own directory: never read, list, or search (find, ls, grep, glob) outside it — not the OS, not other projects, not global package caches. If your task seems to need something outside this directory, it does not — skip that part and say so in your handoff rather than hunting across the filesystem. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; @@ -277,14 +277,37 @@ function formatInputValue(value: unknown): string { } /** - * Render the handoffs of a task's completed dependencies into a context section, - * so a fresh agent sees what the upstream steps did. Empty when there are none. + * The ids of every task `task` transitively depends on — the full upstream + * chain, not just direct dependencies — ordered roots-first, each once. A `seen` + * set dedupes diamonds and guards against cycles. 
+ */ +function ancestorIds(task: QueuedTask, store: QueueStore): string[] { + const seen = new Set(); + const ordered: string[] = []; + const visit = (id: string): void => { + if (seen.has(id)) return; + seen.add(id); + const t = store.get(id); + if (!t) return; + for (const dep of t.dependsOn) visit(dep); // ancestors before dependents + ordered.push(id); + }; + for (const dep of task.dependsOn) visit(dep); + return ordered; +} + +/** + * Render the handoffs of every step `task` transitively depends on into a context + * section, so a fresh agent sees the whole upstream chain — not just its direct + * dependencies. Reliability over token economy: a step must never have to + * re-discover what any ancestor already established just because an intermediate + * handoff happened to omit it. Empty when there are no completed ancestors. */ function renderHandoffContext(task: QueuedTask, store: QueueStore): string { const lines: string[] = []; - for (const depId of task.dependsOn) { - const dep = store.get(depId); - const handoff = store.readHandoff(depId); + for (const id of ancestorIds(task, store)) { + const dep = store.get(id); + const handoff = store.readHandoff(id); if (!dep || !handoff) continue; lines.push(`### ${dep.type}`); lines.push(`- did: ${handoff.did}`); diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts index 7f34f283..3b493d0f 100644 --- a/src/lib/programs/orchestrator/__tests__/queue.test.ts +++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts @@ -3,6 +3,7 @@ import * as os from 'os'; import * as path from 'path'; import { QueueStore, + QUEUE_DIR_NAME, type QueueFile, type TaskHandoff, } from '@lib/programs/orchestrator/queue'; @@ -28,6 +29,15 @@ describe('QueueStore', () => { fs.rmSync(dir, { recursive: true, force: true }); }); + it('drops a self-explaining .DELETE-ME.md in the cache folder', () => { + const note = fs.readFileSync( + path.join(dir, QUEUE_DIR_NAME, 
'.DELETE-ME.md'), + 'utf8', + ); + expect(note).toContain('safely delete'); + expect(note).toContain(`${QUEUE_DIR_NAME}/`); + }); + it('enqueues a pending task with defaults', () => { const t = q.enqueue({ type: 'install' }); expect(t.status).toBe('pending'); diff --git a/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts new file mode 100644 index 00000000..544f17b2 --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts @@ -0,0 +1,68 @@ +import { RunMetrics } from '@lib/programs/orchestrator/run-metrics'; + +describe('RunMetrics', () => { + it('reports time to first start and first completion from run start', () => { + const m = new RunMetrics(0); + m.recordStart(100); + m.recordComplete(300); + m.recordStart(1000); + m.recordComplete(1100); + const s = m.summary(); + expect(s.time_to_first_task_ms).toBe(100); + expect(s.time_to_first_completion_ms).toBe(300); + }); + + it('max_gap_ms is the longest silence across all visible transitions', () => { + const m = new RunMetrics(0); + m.recordStart(100); // visible @100 + m.recordComplete(300); // gap 200 + m.recordStart(1000); // gap 700 ← longest + m.recordComplete(1100); // gap 100 + expect(m.summary().max_gap_ms).toBe(700); + }); + + it('recordStart returns ms_since_run_start and the gap from the previous start', () => { + const m = new RunMetrics(0); + expect(m.recordStart(100)).toEqual({ + ms_since_run_start: 100, + gap_since_prev_start_ms: undefined, + }); + expect(m.recordStart(1000)).toEqual({ + ms_since_run_start: 1000, + gap_since_prev_start_ms: 900, + }); + }); + + it('reports undefined timings for a run with no transitions, not zero', () => { + const s = new RunMetrics(0).summary(); + expect(s.time_to_first_task_ms).toBeUndefined(); + expect(s.time_to_first_completion_ms).toBeUndefined(); + expect(s.max_gap_ms).toBeUndefined(); + }); + + it('a single started-but-unfinished task reports a real zero gap and no 
completion', () => { + const m = new RunMetrics(0); + m.recordStart(50); + const s = m.summary(); + expect(s.time_to_first_task_ms).toBe(50); + expect(s.time_to_first_completion_ms).toBeUndefined(); + expect(s.max_gap_ms).toBe(0); // one visible transition → genuine 0, not undefined + }); + + it('counts a retry stall (start to re-start) as silence', () => { + const m = new RunMetrics(0); + m.recordStart(0); + // the task ended without reporting and was requeued (invisible), then + // re-started 5s later — that stall is a silence the user sees. + m.recordStart(5000); + expect(m.summary().max_gap_ms).toBe(5000); + }); + + it('treats skip and fail as visible transitions for gap tracking', () => { + const m = new RunMetrics(0); + m.recordStart(0); + m.recordTerminal(2000); // skip or fail, gap 2000 + m.recordStart(2500); // gap 500 + expect(m.summary().max_gap_ms).toBe(2000); + }); +}); diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index 31df7372..6353d8c8 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -20,7 +20,7 @@ import { } from '../../agent/agent-interface'; import { OutroKind, type WizardSession } from '../../wizard-session'; import { detectNodePackageManagers } from '../../detection/package-manager'; -import { installSkillById } from '../../wizard-tools'; +import { installSkillById, fetchSkillMenu } from '../../wizard-tools'; import { getUI } from '../../../ui'; import { analytics } from '../../../utils/analytics'; import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides'; @@ -35,6 +35,7 @@ import { type QueuedTask, } from './queue'; import { drainQueue, type RunTask } from './executor'; +import { RunMetrics } from './run-metrics'; import { agentRunTools, assembleSeedPrompt, @@ -74,6 +75,27 @@ function sessionRunOptions(session: WizardSession): WizardRunOptions { }; } +/** + * The framework 
reference is the full `integration` skill. `session.skillId` is + * the bare framework (e.g. `django`), but the skill menu ids it as + * `integration-`. Resolve to the menu id: exact `integration-` + * (the 1:1 frameworks — django, python, flask, …), else the first granular variant + * under it (e.g. `integration-nextjs-app-router`). Undefined when none exists. + */ +async function resolveReferenceSkillId( + skillsBaseUrl: string, + framework: string, +): Promise { + const menu = await fetchSkillMenu(skillsBaseUrl); + if (!menu) return undefined; + const ids = Object.values(menu.categories) + .flat() + .map((s) => s.id); + const exact = `integration-${framework}`; + if (ids.includes(exact)) return exact; + return ids.find((id) => id.startsWith(`integration-${framework}-`)); +} + export async function runOrchestrator( session: WizardSession, programConfig: ProgramConfig, @@ -107,8 +129,7 @@ export async function runOrchestrator( // queue transitions, with the resolved model so cheap work is attributable // to cheap models. const runStartMs = Date.now(); - let firstStartMs: number | undefined; - let lastStartMs: number | undefined; + const metrics = new RunMetrics(runStartMs); const durationMs = (t: QueuedTask) => t.startedAt && t.finishedAt ? Date.parse(t.finishedAt) - Date.parse(t.startedAt) @@ -129,31 +150,28 @@ export async function runOrchestrator( dynamic: task.enqueuedBy !== 'orchestrator', }); break; - case 'start': { - const now = Date.now(); + case 'start': analytics.wizardCapture('orchestrator task started', { ...base, - ms_since_run_start: now - runStartMs, - gap_since_prev_start_ms: - lastStartMs === undefined ? 
undefined : now - lastStartMs, + ...metrics.recordStart(Date.now()), }); - firstStartMs ??= now; - lastStartMs = now; break; - } case 'complete': + metrics.recordComplete(Date.now()); analytics.wizardCapture('orchestrator task completed', { ...base, duration_ms: durationMs(task), }); break; case 'skip': + metrics.recordTerminal(Date.now()); analytics.wizardCapture('orchestrator task skipped', { ...base, duration_ms: durationMs(task), }); break; case 'fail': + metrics.recordTerminal(Date.now()); analytics.wizardCapture('orchestrator task failed', { ...base, duration_ms: durationMs(task), @@ -172,9 +190,12 @@ export async function runOrchestrator( // skill — only the example file is read, when the agent's prompt points at it. let examplePath: string | undefined; let commandmentsPath: string | undefined; - if (session.skillId) { + const referenceSkillId = session.skillId + ? await resolveReferenceSkillId(boot.skillsBaseUrl, session.skillId) + : undefined; + if (referenceSkillId) { const ref = await installSkillById( - session.skillId, + referenceSkillId, session.installDir, boot.skillsBaseUrl, path.join(QUEUE_DIR_NAME, 'reference'), @@ -189,8 +210,14 @@ export async function runOrchestrator( commandmentsPath = commandments; } } else { - logToFile(`[orchestrator] reference example unavailable: ${ref.kind}`); + logToFile( + `[orchestrator] reference unavailable: ${ref.kind} (${referenceSkillId})`, + ); } + } else if (session.skillId) { + logToFile( + `[orchestrator] no integration skill for framework "${session.skillId}"`, + ); } // The client injects the basics (project context + the I/O contract) around @@ -353,11 +380,20 @@ export async function runOrchestrator( try { await drainQueue(store, runTask); } finally { - // Success or failure, the installed task instructions never outlive the run. 
- rmSync(path.join(session.installDir, taskSkillsRoot), { - recursive: true, - force: true, - }); + // Success or failure, no run artifact outlives the run — wipe the whole + // cache folder (queue, handoffs, reference example, installed task + // instructions). The .DELETE-ME.md inside is the fallback if we don't. + try { + rmSync(path.join(session.installDir, QUEUE_DIR_NAME), { + recursive: true, + force: true, + }); + } catch (err) { + analytics.captureException( + err instanceof Error ? err : new Error(String(err)), + { step: 'orchestrator_cache_cleanup' }, + ); + } } renderQueue(); @@ -372,8 +408,11 @@ export async function runOrchestrator( tasks_failed: summary.failed, tasks_skipped: summary.skipped, total_duration_ms: Date.now() - runStartMs, - time_to_first_task_ms: - firstStartMs === undefined ? undefined : firstStartMs - runStartMs, + ...metrics.summary(), + dynamic_enqueue_count: store + .list() + .filter((t) => t.enqueuedBy !== 'orchestrator').length, + retried_task_count: store.list().filter((t) => t.attempts > 1).length, }); // The build step flags any unresolved conflict in its handoff; surface the diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 19545d9d..4aa7c368 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -6,9 +6,10 @@ * returns every pending task whose dependencies are satisfied, and how many of * those run at once is decided by the task graph, not the queue. * - * Every transition rewrites `/.posthog-wizard/queue.json`, a small - * file holding the whole queue, handoffs included. Today it is the run's - * log and the report's source; later it is the resume point. + * Every transition rewrites `/.posthog-wizard-cache/queue.json`, a + * small file holding the whole queue, handoffs included. It is the run's log + * and the report's source. The whole cache folder is run-scoped and wiped when + * the run ends. 
*/ import * as fs from 'fs'; import * as path from 'path'; @@ -79,13 +80,22 @@ export interface EnqueueInput { enqueuedBy?: string; } -export const QUEUE_DIR_NAME = '.posthog-wizard'; +export const QUEUE_DIR_NAME = '.posthog-wizard-cache'; const DEFAULT_MAX_ATTEMPTS = 2; function nowIso(): string { return new Date().toISOString(); } +/** Dropped in the cache folder so an orphaned copy explains itself. */ +const DELETE_ME_FILE = '.DELETE-ME.md'; +const DELETE_ME_BODY = `# Safe to delete + +This folder contains run artifacts from the PostHog Wizard. This should have +been deleted if the Wizard has finished running. If this wasn't deleted for +some reason, you can safely delete the entire \`${QUEUE_DIR_NAME}/\` folder. +`; + /** Every queue transition, in the order it is reflected. */ export type TransitionEvent = | 'enqueue' @@ -120,6 +130,7 @@ export class QueueStore { const dir = path.join(installDir, QUEUE_DIR_NAME); this.queuePath = path.join(dir, 'queue.json'); fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(path.join(dir, DELETE_ME_FILE), DELETE_ME_BODY); } // ── Reads ─────────────────────────────────────────────────────────── diff --git a/src/lib/programs/orchestrator/run-metrics.ts b/src/lib/programs/orchestrator/run-metrics.ts new file mode 100644 index 00000000..8ea82415 --- /dev/null +++ b/src/lib/programs/orchestrator/run-metrics.ts @@ -0,0 +1,85 @@ +/** + * Responsiveness metrics for an orchestrator run. + * + * Responsiveness is the experiment's headline: how quickly the user sees the + * first progress, and whether progress stays steady (no long silences). The math + * is accumulated from queue transitions but kept here, pure and time-injected, so + * it is unit-testable away from the runner. Wall-clock times are passed in as + * milliseconds; the caller owns the clock. + */ + +export interface RunMetricsSummary { + /** Run start → first task started. 
*/ + time_to_first_task_ms?: number; + /** Run start → first task completed (the first visible "done"). */ + time_to_first_completion_ms?: number; + /** Longest silence between two consecutive user-visible transitions. */ + max_gap_ms?: number; +} + +/** The per-event timing the `orchestrator task started` event reports. */ +export interface StartTiming { + ms_since_run_start: number; + gap_since_prev_start_ms?: number; +} + +export class RunMetrics { + private firstStartMs?: number; + private lastStartMs?: number; + private firstCompleteMs?: number; + private lastVisibleMs?: number; + private maxGapMs = 0; + + constructor(private readonly runStartMs: number) {} + + /** A task started. Returns the per-start-event timing for the start event. */ + recordStart(nowMs: number): StartTiming { + const timing: StartTiming = { + ms_since_run_start: nowMs - this.runStartMs, + gap_since_prev_start_ms: + this.lastStartMs === undefined ? undefined : nowMs - this.lastStartMs, + }; + this.firstStartMs ??= nowMs; + this.lastStartMs = nowMs; + this.markVisible(nowMs); + return timing; + } + + /** A task completed. */ + recordComplete(nowMs: number): void { + this.firstCompleteMs ??= nowMs; + this.markVisible(nowMs); + } + + /** A task reached a terminal non-complete state the user sees (skip/fail). */ + recordTerminal(nowMs: number): void { + this.markVisible(nowMs); + } + + /** + * The run-level responsiveness summary. Timings are `undefined` when the + * relevant transition never happened (e.g. a run that started no task), so a + * no-task run stays distinguishable from a genuine zero. + */ + summary(): RunMetricsSummary { + return { + time_to_first_task_ms: + this.firstStartMs === undefined + ? undefined + : this.firstStartMs - this.runStartMs, + time_to_first_completion_ms: + this.firstCompleteMs === undefined + ? undefined + : this.firstCompleteMs - this.runStartMs, + max_gap_ms: this.lastVisibleMs === undefined ? 
undefined : this.maxGapMs, + }; + } + + /** requeue is not user-visible, so a retry stall counts as silence here. */ + private markVisible(nowMs: number): void { + if (this.lastVisibleMs !== undefined) { + this.maxGapMs = Math.max(this.maxGapMs, nowMs - this.lastVisibleMs); + } + this.lastVisibleMs = nowMs; + } +} diff --git a/src/ui/tui/primitives/LogViewer.tsx b/src/ui/tui/primitives/LogViewer.tsx index 9fa02cd4..8277802c 100644 --- a/src/ui/tui/primitives/LogViewer.tsx +++ b/src/ui/tui/primitives/LogViewer.tsx @@ -15,8 +15,10 @@ import { useState, useEffect } from 'react'; import * as fs from 'fs'; import { useStdoutDimensions } from '@ui/tui/hooks/useStdoutDimensions'; -/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar + tab bar */ -const CHROME_ROWS = 8; +/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar + + * tab bar, with a couple rows of headroom so the tail never crowds the status + * bar below it. */ +const CHROME_ROWS = 10; /** Bytes read from the end of the log per refresh — large enough to contain * any practical visible window of lines, small enough to allocate cheaply. 
*/ From b2842fab853ef9151db7fe85402969f944e8781d Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" <29069505+gewenyu99@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:23:42 -0400 Subject: [PATCH 12/12] feat(agent-runner): don't abort non-interactive runs on a health-check outage (#678) Co-authored-by: Claude Fable 5 --- src/lib/agent/agent-runner.ts | 18 ++++++++++++------ src/ui/logging-ui.ts | 6 ++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 0f3269c4..b202a3d0 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -55,6 +55,7 @@ import { } from '@utils/debug'; import { createBenchmarkPipeline } from '@lib/middleware/benchmark'; import { wizardAbort, WizardError, registerCleanup } from '@utils/wizard-abort'; +import { isNonInteractiveEnvironment } from '@utils/environment'; import { formatScanReport, writeScanReport } from '@lib/yara-hooks'; import { detectNodePackageManagers } from '@lib/detection/package-manager'; import type { PackageManagerDetector } from '@lib/detection/package-manager'; @@ -279,12 +280,17 @@ async function bootstrapProgram( await getUI().showBlockingOutage(readiness); - await wizardAbort({ - message: - 'Cannot start — external services are down:\n' + - blockingLabels.map((l) => ` - ${l}`).join('\n') + - '\n\nPlease try again later.', - }); + // The TUI lets the user continue past an outage; non-interactive runs + // (CI) do the same automatically — the degraded services are reported + // above, but we proceed rather than aborting on a transient upstream blip. 
+ if (!isNonInteractiveEnvironment()) { + await wizardAbort({ + message: + 'Cannot start — external services are down:\n' + + blockingLabels.map((l) => ` - ${l}`).join('\n') + + '\n\nPlease try again later.', + }); + } } else if (readiness.decision === WizardReadiness.YesWithWarnings) { getUI().setReadinessWarnings(readiness); } diff --git a/src/ui/logging-ui.ts b/src/ui/logging-ui.ts index 9ae0a2ee..40a32523 100644 --- a/src/ui/logging-ui.ts +++ b/src/ui/logging-ui.ts @@ -114,7 +114,7 @@ export class LoggingUI implements WizardUI { } showBlockingOutage(result: WizardReadinessResult): Promise { - console.log(`▲ Service health issues detected — blocking outage.`); + console.log(`▲ Service health issues detected.`); const blockingKeys = getBlockingServiceKeys(result.health); if (blockingKeys.length > 0) { console.log(`│`); @@ -131,7 +131,9 @@ export class LoggingUI implements WizardUI { for (const reason of result.reasons) { console.log(`│ ${reason}`); } - console.log(`│ The wizard cannot start while these services are down.`); + console.log( + `│ Continuing anyway — health checks are advisory in non-interactive runs.`, + ); return Promise.resolve(); }