From 19c211277982d1a50d49c8fde559c1e539030f2c Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Wed, 10 Jun 2026 17:00:28 -0400 Subject: [PATCH 1/3] feat(orchestrator): harness-excluded task types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI harness can exclude task types from a run via a CI-build-only env var (a comma-separated list, e.g. dashboard — CI runs must not create dashboards). Excluded types do not exist for the run: the seed cannot enqueue them and no agent is spun up. The policy lives with the harness that owns it; the wizard and the served content stay run-mode agnostic. Published builds strip the path, smoke-enforced alongside the flag overrides. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/smoke-test.sh | 26 ++++++++------ src/env.ts | 1 + .../__tests__/agent-prompt-loader.test.ts | 12 +++++++ .../orchestrator/agent-prompt-loader.ts | 12 +++++-- .../orchestrator/orchestrator-runner.ts | 2 ++ src/utils/__tests__/ci-flag-overrides.test.ts | 36 ++++++++++++++++++- src/utils/ci-flag-overrides.ts | 35 ++++++++++++++---- 7 files changed, 103 insertions(+), 21 deletions(-) diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh index 1e9a690b..fcdab7f3 100755 --- a/scripts/smoke-test.sh +++ b/scripts/smoke-test.sh @@ -24,14 +24,16 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH # builds and tsdown strips it; its env var name appearing in dist/*.js means # dead-code elimination regressed and a prod surface leaked. Sourcemaps keep # the original source, so only .js output counts. -OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES' +OVERRIDE_MARKERS='WIZARD_CI_FLAG_OVERRIDES WIZARD_CI_EXCLUDE_TASKS' if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then - # CI builds must keep the path — its absence means the override silently - # stopped working and CI is back to testing live flags. - if ! grep -q "$OVERRIDE_MARKER" ./dist/*.js; then - echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2 - exit 1 - fi + # CI builds must keep the paths — their absence means the overrides silently + # stopped working and CI is back to testing live behavior. + for marker in $OVERRIDE_MARKERS; do + if ! grep -q "$marker" ./dist/*.js; then + echo "Smoke test failed: CI build is missing the $marker path" >&2 + exit 1 + fi + done # And a real invocation must accept the env var. yargs claims every # POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects # unknown ones during command parse (--version/--help short-circuit and @@ -44,10 +46,12 @@ if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then exit 1 fi else - if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then - echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2 - exit 1 - fi + for marker in $OVERRIDE_MARKERS; do + if grep -q "$marker" ./dist/*.js; then + echo "Smoke test failed: $marker code leaked into a production build" >&2 + exit 1 + fi + done fi # ── 3. --ci rejected in production builds ──────────────────────────────────── diff --git a/src/env.ts b/src/env.ts index c32e886a..4b727441 100644 --- a/src/env.ts +++ b/src/env.ts @@ -43,6 +43,7 @@ type RuntimeEnvKey = // Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD') // would claim it as an unknown CLI option and strict-reject the run. | 'WIZARD_CI_FLAG_OVERRIDES' + | 'WIZARD_CI_EXCLUDE_TASKS' // Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix) | 'POSTHOG_WIZARD_BENCHMARK_CONFIG' | 'POSTHOG_WIZARD_BENCHMARK_FILE' diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts index 22ce11af..0d1c4966 100644 --- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts @@ -131,6 +131,18 @@ describe('buildRegistry', () => { // A flowless prompt (e.g. the documentation example) joins no registry. expect(registry.get('example')).toBeUndefined(); }); + + it('drops harness-excluded types; unrestricted runs keep them', () => { + const prompts = [ + prompt({ type: 'plan', flow: 'f', seed: true }), + prompt({ type: 'build', flow: 'f' }), + prompt({ type: 'dashboard', flow: 'f' }), + ]; + expect( + buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types, + ).toEqual(['build']); + expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']); + }); }); describe('resolveTask', () => { diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts index 3212a2c3..76870283 100644 --- a/src/lib/programs/orchestrator/agent-prompt-loader.ts +++ b/src/lib/programs/orchestrator/agent-prompt-loader.ts @@ -137,8 +137,15 @@ export interface AgentRegistry { export function buildRegistry( prompts: readonly AgentPrompt[], flow: string, + opts?: { exclude?: readonly string[] }, ): AgentRegistry { - const inFlow = prompts.filter((p) => p.flow === flow); + // The harness can exclude task types (CI excludes dashboards). An excluded + // type does not exist for the run: the seed cannot enqueue it and no agent + // is ever spun up for it. + const excluded = new Set(opts?.exclude ?? []); + const inFlow = prompts.filter( + (p) => p.flow === flow && !excluded.has(p.type), + ); const byType = new Map(inFlow.map((p) => [p.type, p])); return { types: inFlow.filter((p) => !p.seed).map((p) => p.type), @@ -238,6 +245,7 @@ async function fetchText(url: string): Promise { export async function loadAgentRegistry( skillsBaseUrl: string, flow: string, + opts?: { exclude?: readonly string[] }, ): Promise { const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`); const menu = JSON.parse(menuRaw) as AgentMenu; @@ -249,7 +257,7 @@ export async function loadAgentRegistry( }), ); - return buildRegistry(prompts, flow); + return buildRegistry(prompts, flow, opts); } /** diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index b6129016..c16c905c 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -23,6 +23,7 @@ import { detectNodePackageManagers } from '../../detection/package-manager'; import { installSkillById } from '../../wizard-tools'; import { getUI } from '../../../ui'; import { analytics } from '../../../utils/analytics'; +import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides'; import { logToFile } from '../../../utils/debug'; import type { ProgramConfig } from '../program-step'; import type { BootstrapResult } from '../../agent/agent-runner'; @@ -88,6 +89,7 @@ export async function runOrchestrator( const registry = await loadAgentRegistry( boot.skillsBaseUrl, programConfig.id, + { exclude: ciExcludedTaskTypes() }, ); const seedPrompt = registry.seed; if (!seedPrompt) { diff --git a/src/utils/__tests__/ci-flag-overrides.test.ts b/src/utils/__tests__/ci-flag-overrides.test.ts index 4d2333a1..4f0d844f 100644 --- a/src/utils/__tests__/ci-flag-overrides.test.ts +++ b/src/utils/__tests__/ci-flag-overrides.test.ts @@ -1,4 +1,7 @@ -import { applyCiFlagOverrides } from '@utils/ci-flag-overrides'; +import { + applyCiFlagOverrides, + ciExcludedTaskTypes, +} from '@utils/ci-flag-overrides'; jest.mock('@utils/debug', () => ({ logToFile: jest.fn(), @@ -61,3 +64,34 @@ describe('applyCiFlagOverrides', () => { }); }); }); + +describe('ciExcludedTaskTypes', () => { + afterEach(() => { + delete process.env.WIZARD_CI_EXCLUDE_TASKS; + }); + + it('is empty when nothing is excluded', () => { + expect(ciExcludedTaskTypes()).toEqual([]); + }); + + it('parses the comma-separated list, ignoring stray whitespace', () => { + process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard, report ,'; + expect(ciExcludedTaskTypes()).toEqual(['dashboard', 'report']); + }); + + it('is inert in production builds', () => { + const prevNodeEnv = process.env.NODE_ENV; + process.env.NODE_ENV = 'production'; + process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard'; + let result: readonly string[] | undefined; + jest.isolateModules(() => { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const prod = require('@utils/ci-flag-overrides') as { + ciExcludedTaskTypes: typeof ciExcludedTaskTypes; + }; + result = prod.ciExcludedTaskTypes(); + }); + process.env.NODE_ENV = prevNodeEnv; + expect(result).toEqual([]); + }); +}); diff --git a/src/utils/ci-flag-overrides.ts b/src/utils/ci-flag-overrides.ts index e8790e23..475060c3 100644 --- a/src/utils/ci-flag-overrides.ts +++ b/src/utils/ci-flag-overrides.ts @@ -3,14 +3,15 @@ * * CI must route deterministically: a run that tests the orchestrator arm says * so explicitly instead of depending on a live feature flag someone can edit - * mid-week. `WIZARD_CI_FLAG_OVERRIDES` is a JSON object of flag key → - * value, merged over whatever PostHog returned. + * mid-week. The override env var (see the allowlist in `env.ts`) is a JSON + * object of flag key → value, merged over whatever PostHog returned. * * The override path exists only in CI builds (`pnpm build:ci`). Published - * builds inline NODE_ENV as the literal "production", the guard below - * collapses, and tsdown strips the rest from the bundle — and the smoke test - * asserts the env var's name is physically absent from production output, so - * this can never quietly become a production surface. + * builds inline NODE_ENV as the literal "production", the guards collapse, + * and tsdown strips the rest from the bundle — and the smoke test asserts the + * env var names are physically absent from production output (which is also + * why no comment in this file may spell them out), so this can never quietly + * become a production surface. */ import { runtimeEnv } from '@env'; import { logToFile } from './debug'; @@ -33,7 +34,7 @@ export function applyCiFlagOverrides( // A malformed override is a CI misconfiguration. Fail the run loudly // rather than silently testing whatever the live flags happen to say. throw new Error( - 'WIZARD_CI_FLAG_OVERRIDES is not valid JSON (expected {"flag-key": value, ...}).', + 'The CI flag-override env var is not valid JSON (expected {"flag-key": value, ...}).', ); } @@ -44,3 +45,23 @@ export function applyCiFlagOverrides( logToFile('[flags] CI overrides applied', overrides); return merged; } + +/** + * Task types excluded from this run. The exclusion env var (see the allowlist + * in `env.ts`) is a comma-separated list (e.g. `dashboard`), set by the CI + * harness that owns the policy — the wizard and the served content stay + * run-mode agnostic. CI-build only, same as the flag overrides: published + * builds strip this path. + */ +export function ciExcludedTaskTypes(): readonly string[] { + if (process.env.NODE_ENV === 'production') return []; + + const raw = runtimeEnv('WIZARD_CI_EXCLUDE_TASKS'); + if (!raw) return []; + const types = raw + .split(',') + .map((t) => t.trim()) + .filter(Boolean); + if (types.length > 0) logToFile('[flags] CI task exclusions', types); + return types; +} From f9c67b14cf797b18c1c4cbfa316158880f9ff96b Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Tue, 16 Jun 2026 15:34:19 -0400 Subject: [PATCH 2/3] fix(agent): drop EU MCP endpoint, resolve region from the bearer token The server reads the user's region from the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is no longer needed. Removes the host-parsing branch in both the orchestrator bootstrap and the linear MCP runner. Co-Authored-By: Claude Fable 5 --- .../__tests__/agent-prompt-loader.test.ts | 259 ++++++++++++++ src/lib/agent/agent-prompt-loader.ts | 336 ++++++++++++++++++ src/lib/agent/agent-runner.ts | 8 +- src/lib/agent/mcp-prompt-streaming.ts | 39 +- 4 files changed, 605 insertions(+), 37 deletions(-) create mode 100644 src/lib/agent/__tests__/agent-prompt-loader.test.ts create mode 100644 src/lib/agent/agent-prompt-loader.ts diff --git a/src/lib/agent/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts new file mode 100644 index 00000000..a0a2b04a --- /dev/null +++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts @@ -0,0 +1,259 @@ +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { + agentRunTools, + assembleTaskPrompt, + buildRegistry, + parseAgentPrompt, + resolveTask, + taskModel, + type AgentPrompt, + type AgentRegistry, + type OrchestratorPromptContext, +} from '../agent-prompt-loader'; +import { QueueStore } from '../../programs/orchestrator/queue'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-')); +} + +function registryOf(prompts: AgentPrompt[]): AgentRegistry { + return buildRegistry( + prompts.map((p) => ({ ...p, flow: 'test-flow' })), + 'test-flow', + ); +} + +describe('parseAgentPrompt', () => { + const sample = `--- +type: instrument-events +model: claude-sonnet-4-6 # cheapest model that succeeds +skills: [instrument-events] +allowedTools: [Read, Edit, Grep, Glob, Bash] +disallowedTools: [enqueue_task] +dependsOn: [init] +--- + +## Goal +Add at least one capture call. +`; + + it('parses frontmatter scalars and inline arrays', () => { + const p = parseAgentPrompt(sample, 'fallback'); + expect(p.type).toBe('instrument-events'); + expect(p.model).toBe('claude-sonnet-4-6'); + expect(p.skills).toEqual(['instrument-events']); + expect(p.allowedTools).toEqual(['Read', 'Edit', 'Grep', 'Glob', 'Bash']); + expect(p.disallowedTools).toEqual(['enqueue_task']); + expect(p.dependsOn).toEqual(['init']); + }); + + it('strips inline comments and keeps the body', () => { + const p = parseAgentPrompt(sample, 'fallback'); + expect(p.model).not.toContain('#'); + expect(p.body).toContain('## Goal'); + expect(p.body).not.toContain('---'); + }); + + it('falls back to the menu id when type is omitted', () => { + const p = parseAgentPrompt('---\nmodel: x\n---\nbody', 'install'); + expect(p.type).toBe('install'); + }); + + it('parses the flow from frontmatter', () => { + const p = parseAgentPrompt('---\nflow: audit\n---\nx', 'fix-events'); + expect(p.flow).toBe('audit'); + }); + + it('marks the seed from frontmatter; everything else is a task', () => { + expect(parseAgentPrompt('---\nseed: true\n---\nplan', 'planner').seed).toBe( + true, + ); + expect(parseAgentPrompt('---\nmodel: x\n---\nbody', 'install').seed).toBe( + false, + ); + }); + + it('defaults missing array fields to empty and model to undefined', () => { + const p = parseAgentPrompt('no frontmatter at all', 'stub'); + expect(p.model).toBeUndefined(); + expect(p.skills).toEqual([]); + expect(p.dependsOn).toEqual([]); + expect(p.body).toBe('no frontmatter at all'); + }); +}); + +describe('agentRunTools', () => { + it('MCP-qualifies orchestrator tools and passes native tools through', () => { + const p = parseAgentPrompt( + '---\nallowedTools: [Read, read_handoffs]\ndisallowedTools: [enqueue_task, complete_task, Bash]\n---\nx', + 't', + ); + const { allowedTools, disallowedTools } = agentRunTools(p); + expect(allowedTools).toEqual([ + 'Read', + 'mcp__posthog-wizard__read_handoffs', + ]); + expect(disallowedTools).toEqual([ + 'mcp__posthog-wizard__enqueue_task', + 'mcp__posthog-wizard__complete_task', + 'Bash', + ]); + }); +}); + +describe('buildRegistry', () => { + const prompt = (over: Partial): AgentPrompt => ({ + type: 'x', + seed: false, + skills: [], + allowedTools: [], + disallowedTools: [], + dependsOn: [], + body: 'b', + ...over, + }); + + it('scopes to one flow and keeps the seed out of the task types', () => { + const registry = buildRegistry( + [ + prompt({ type: 'plan-audit', flow: 'audit', seed: true }), + prompt({ type: 'fix-events', flow: 'audit' }), + prompt({ type: 'install', flow: 'posthog-integration' }), + prompt({ type: 'example' }), + ], + 'audit', + ); + expect(registry.types).toEqual(['fix-events']); + expect(registry.seed?.type).toBe('plan-audit'); + expect(registry.get('install')).toBeUndefined(); + // A flowless prompt (e.g. the documentation example) joins no registry. + expect(registry.get('example')).toBeUndefined(); + }); + + it('drops harness-excluded types; unrestricted runs keep them', () => { + const prompts = [ + prompt({ type: 'plan', flow: 'f', seed: true }), + prompt({ type: 'build', flow: 'f' }), + prompt({ type: 'dashboard', flow: 'f' }), + ]; + expect( + buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types, + ).toEqual(['build']); + expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']); + }); +}); + +describe('resolveTask', () => { + let dir: string; + let store: QueueStore; + + beforeEach(() => { + dir = tmpDir(); + store = new QueueStore(dir, 'run-1'); + }); + + afterEach(() => { + fs.rmSync(dir, { recursive: true, force: true }); + }); + + const prompt: AgentPrompt = { + type: 'capture', + seed: false, + model: 'claude-haiku-4-5-20251001', + skills: ['instrument-events'], + allowedTools: ['Read', 'Edit'], + disallowedTools: ['enqueue_task'], + dependsOn: ['plan-capture'], + body: '## Goal\nInstrument the planned events.', + }; + + it('throws when no prompt is registered for the type', () => { + const registry = registryOf([]); + const task = { type: 'capture', dependsOn: [] } as never; + expect(() => resolveTask(registry, task, store)).toThrow(/capture/); + }); + + it('resolves model, tools, and skills from the prompt', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture' }); + const resolved = resolveTask(registry, task, store); + expect(resolved.model).toBe('claude-haiku-4-5-20251001'); + expect(resolved.skills).toEqual(['instrument-events']); + expect(resolved.disallowedTools).toEqual([ + 'mcp__posthog-wizard__enqueue_task', + ]); + }); + + it('prefers the enqueue model override over the prompt model', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture', model: 'override-x' }); + expect(resolveTask(registry, task, store).model).toBe('override-x'); + }); + + it("appends upstream dependencies' handoffs as context", () => { + const registry = registryOf([prompt]); + const dep = store.enqueue({ type: 'plan-capture' }); + store.complete(dep.id, { + goals: 'decide events', + did: 'picked signup and purchase', + forNextAgent: 'instrument those two', + }); + const task = store.enqueue({ + type: 'capture', + dependsOn: [dep.id], + }); + const resolved = resolveTask(registry, task, store); + expect(resolved.prompt).toContain('Context from previous steps'); + expect(resolved.prompt).toContain('picked signup and purchase'); + expect(resolved.prompt).toContain('instrument those two'); + }); + + it('omits the context section when there are no handoffs', () => { + const registry = registryOf([prompt]); + const task = store.enqueue({ type: 'capture' }); + expect(resolveTask(registry, task, store).prompt).not.toContain( + 'Context from previous steps', + ); + }); +}); + +describe('taskModel', () => { + const prompt = parseAgentPrompt( + '---\nmodel: prompt-model\n---\nx', + 'capture', + ); + + it('prefers the enqueue override, then the prompt, then the default', () => { + const registry = registryOf([prompt]); + const task = { type: 'capture' }; + expect(taskModel(registry, { ...task, model: 'override' } as never)).toBe( + 'override', + ); + expect(taskModel(registry, task as never)).toBe('prompt-model'); + expect(taskModel(registryOf([]), task as never)).toBe('claude-sonnet-4-6'); + }); +}); + +describe('assembleTaskPrompt', () => { + const ctx: OrchestratorPromptContext = { + projectId: 1, + projectApiKey: 'phc_x', + host: 'https://us.posthog.com', + }; + + it('points the agent at its installed task instructions', () => { + const assembled = assembleTaskPrompt(ctx, 'do the task', [ + '.posthog-wizard/skills/capture/SKILL.md', + ]); + expect(assembled).toContain('.posthog-wizard/skills/capture/SKILL.md'); + expect(assembled).toContain('do the task'); + }); + + it('omits the instructions section when no skills are installed', () => { + expect(assembleTaskPrompt(ctx, 'do the task')).not.toContain( + 'task instructions', + ); + }); +}); diff --git a/src/lib/agent/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts new file mode 100644 index 00000000..1fe487b1 --- /dev/null +++ b/src/lib/agent/agent-prompt-loader.ts @@ -0,0 +1,336 @@ +/** + * Agent-prompt loader + registry. + * + * Agent prompts are the WHAT of a task: a markdown file per type, served from + * context-mill as the `agents` content type (parallel to skills). The frontmatter + * carries the artifacts the executor needs — model, the mini-skills to load (the + * HOW), the tools the task may use, and its dependencies — and the body is the + * instruction the agent reads. + * + * The registry is fetched once at startup and scoped to one flow — agents + * declare `flow` and (for the planner) `seed: true` in frontmatter, so each + * program (integration, audit, migration, ...) ships its own agent set and the + * loader stays generic. Every prompt is downloaded and parsed up front, so + * resolving a task to its run config is synchronous and adds no mid-drain + * network latency. The registry's type list also drives `enqueue_task` + * validation. + */ +import type { QueueStore, QueuedTask } from '../programs/orchestrator/queue'; +import type { ResolvedTask } from '../programs/orchestrator/executor'; + +/** + * The basics the client injects around every agent-prompt body. The `/agents/` + * files carry intent only (goal, success criteria); the wizard owns the I/O + * contract — who the agent is, how it reports, how it surfaces progress — so the + * authored prompts never restate it. + */ +export interface OrchestratorPromptContext { + projectId: number; + projectApiKey: string; + host: string; + /** Path to the framework's reference implementation (EXAMPLE.md), if available. */ + examplePath?: string; + /** Path to the framework's rules (COMMANDMENTS.md), if available. */ + commandmentsPath?: string; +} + +function projectContext(ctx: OrchestratorPromptContext): string { + return `You have access to the PostHog MCP server and the wizard tools. + +Project context: +- PostHog Project ID: ${ctx.projectId} +- PostHog public token: ${ctx.projectApiKey} +- PostHog Host: ${ctx.host}`; +} + +/** Points the agent at the framework's reference integration to learn patterns from. */ +function exampleReference(ctx: OrchestratorPromptContext): string | null { + if (!ctx.examplePath) return null; + return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`; +} + +/** The framework's rules ship with the reference skill; every task follows them. */ +function commandmentsReference(ctx: OrchestratorPromptContext): string | null { + if (!ctx.commandmentsPath) return null; + return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; +} + +const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; + +const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; + +/** + * Points the agent at its installed task instructions (the HOW). They live under + * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load + * them — the prompt has to name them. + */ +function skillReference(paths: readonly string[]): string | null { + if (paths.length === 0) return null; + const list = paths.map((p) => `\`${p}\``).join(', '); + return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`; +} + +/** A task agent's full prompt: injected basics, then the authored intent. */ +export function assembleTaskPrompt( + ctx: OrchestratorPromptContext, + body: string, + skillPaths: readonly string[] = [], +): string { + return [ + projectContext(ctx), + exampleReference(ctx), + commandmentsReference(ctx), + skillReference(skillPaths), + TASK_BASICS, + body, + ] + .filter(Boolean) + .join('\n\n'); +} + +/** The seed agent's full prompt: injected basics, then the authored intent. */ +export function assembleSeedPrompt( + ctx: OrchestratorPromptContext, + body: string, +): string { + return [projectContext(ctx), SEED_BASICS, body].join('\n\n'); +} + +/** Used when neither the enqueue call nor the prompt frontmatter names a model. */ +const DEFAULT_TASK_MODEL = 'claude-sonnet-4-6'; + +/** Orchestrator tools are MCP tools under the `posthog-wizard` server. Frontmatter + * names them short (e.g. `enqueue_task`); the SDK gates on the full name. */ +const ORCHESTRATOR_TOOL_PREFIX = 'mcp__posthog-wizard__'; +const ORCHESTRATOR_TOOLS = new Set([ + 'enqueue_task', + 'complete_task', + 'read_handoffs', +]); + +/** A parsed agent prompt. The frontmatter fields plus the markdown body. */ +export interface AgentPrompt { + type: string; + /** Human-readable title for the TUI; falls back to `type` when absent. */ + label?: string; + /** The flow this agent belongs to (the program id, e.g. \`posthog-integration\`). */ + flow?: string; + /** Marks the flow's planner: it seeds the queue and is not an enqueueable task. */ + seed: boolean; + model?: string; + skills: string[]; + allowedTools: string[]; + disallowedTools: string[]; + dependsOn: string[]; + body: string; +} + +export interface AgentRegistry { + /** The flow's enqueueable task types — every prompt except the seed. */ + readonly types: string[]; + /** The flow's planner, the one prompt marked `seed: true` in its frontmatter. */ + readonly seed?: AgentPrompt; + get(type: string): AgentPrompt | undefined; +} + +/** The registry for one flow's prompts. Pure; the loader feeds it the fetched set. */ +export function buildRegistry( + prompts: readonly AgentPrompt[], + flow: string, + opts?: { exclude?: readonly string[] }, +): AgentRegistry { + // The harness can exclude task types (CI excludes dashboards). An excluded + // type does not exist for the run: the seed cannot enqueue it and no agent + // is ever spun up for it. + const excluded = new Set(opts?.exclude ?? []); + const inFlow = prompts.filter( + (p) => p.flow === flow && !excluded.has(p.type), + ); + const byType = new Map(inFlow.map((p) => [p.type, p])); + return { + types: inFlow.filter((p) => !p.seed).map((p) => p.type), + seed: inFlow.find((p) => p.seed), + get: (type) => byType.get(type), + }; +} + +interface AgentMenu { + agents: { id: string; downloadUrl: string }[]; +} + +/** A native tool passes through; an orchestrator tool gets its MCP-qualified name. */ +function expandToolName(name: string): string { + return ORCHESTRATOR_TOOLS.has(name) + ? `${ORCHESTRATOR_TOOL_PREFIX}${name}` + : name; +} + +/** A prompt's allow/disallow lists with orchestrator tool names MCP-qualified. */ +export function agentRunTools(prompt: AgentPrompt): { + allowedTools: string[]; + disallowedTools: string[]; +} { + return { + allowedTools: prompt.allowedTools.map(expandToolName), + disallowedTools: prompt.disallowedTools.map(expandToolName), + }; +} + +function toStringArray(value: unknown): string[] { + if (!Array.isArray(value)) return []; + return value.filter((v): v is string => typeof v === 'string'); +} + +/** + * Parse the leading `---` frontmatter block and the markdown body. The + * frontmatter is a small, known schema (scalars and inline `[a, b]` arrays), so + * a tiny parser covers it without a YAML dependency. Inline `# comments` after a + * value are stripped. `fallbackType` is the menu id, used when the body omits + * `type:`. + */ +export function parseAgentPrompt( + text: string, + fallbackType: string, +): AgentPrompt { + const match = text.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/); + const frontmatter = match ? match[1] : ''; + const body = (match ? match[2] : text).trim(); + + const fields: Record = {}; + for (const rawLine of frontmatter.split(/\r?\n/)) { + const line = rawLine.replace(/\s+#.*$/, '').trim(); + if (!line || line.startsWith('#')) continue; + const kv = line.match(/^([\w-]+):\s*(.*)$/); + if (!kv) continue; + const [, key, raw] = kv; + if (raw.startsWith('[') && raw.endsWith(']')) { + fields[key] = raw + .slice(1, -1) + .split(',') + .map((s) => s.trim().replace(/^['"]|['"]$/g, '')) + .filter(Boolean); + } else { + fields[key] = raw.replace(/^['"]|['"]$/g, ''); + } + } + + const model = typeof fields.model === 'string' ? fields.model : undefined; + return { + type: typeof fields.type === 'string' ? fields.type : fallbackType, + label: typeof fields.label === 'string' ? fields.label : undefined, + flow: typeof fields.flow === 'string' ? fields.flow : undefined, + seed: fields.seed === 'true', + model, + skills: toStringArray(fields.skills), + allowedTools: toStringArray(fields.allowedTools), + disallowedTools: toStringArray(fields.disallowedTools), + dependsOn: toStringArray(fields.dependsOn), + body, + }; +} + +async function fetchText(url: string): Promise { + const res = await fetch(url); + if (!res.ok) { + throw new Error(`Fetch ${url} failed: ${res.status} ${res.statusText}`); + } + return res.text(); +} + +/** + * Fetch the agent menu and every agent prompt it lists, parse them, and build + * the registry for one flow. Throws if the menu cannot be fetched — the + * orchestrator cannot run without its prompts. + */ +export async function loadAgentRegistry( + skillsBaseUrl: string, + flow: string, + opts?: { exclude?: readonly string[] }, +): Promise { + const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`); + const menu = JSON.parse(menuRaw) as AgentMenu; + + const prompts = await Promise.all( + (menu.agents ?? []).map(async (entry) => { + const text = await fetchText(entry.downloadUrl); + return parseAgentPrompt(text, entry.id); + }), + ); + + return buildRegistry(prompts, flow, opts); +} + +/** + * Render a task's own inputs into a section, so a fanned-out task (e.g. one + * `capture` per event) sees the specific thing it owns. Empty when there are none. + */ +function renderInputs(task: QueuedTask): string { + const entries = Object.entries(task.inputs ?? {}); + if (entries.length === 0) return ''; + const lines = entries.map(([k, v]) => `- ${k}: ${formatInputValue(v)}`); + return `## Your task input\n\n${lines.join('\n')}`; +} + +function formatInputValue(value: unknown): string { + if (typeof value === 'string') return value; + return JSON.stringify(value); +} + +/** + * Render the handoffs of a task's completed dependencies into a context section, + * so a fresh agent sees what the upstream steps did. Empty when there are none. + */ +function renderHandoffContext(task: QueuedTask, store: QueueStore): string { + const lines: string[] = []; + for (const depId of task.dependsOn) { + const dep = store.get(depId); + const handoff = store.readHandoff(depId); + if (!dep || !handoff) continue; + lines.push(`### ${dep.type}`); + lines.push(`- did: ${handoff.did}`); + lines.push(`- for you: ${handoff.forNextAgent}`); + if (handoff.filesTouched?.length) { + lines.push(`- files: ${handoff.filesTouched.join(', ')}`); + } + lines.push(''); + } + if (lines.length === 0) return ''; + return `## Context from previous steps\n\n${lines.join('\n')}`.trim(); +} + +/** + * Resolve a queued task to its run config: the prompt body (with upstream + * handoffs appended), the model, and the tool lists with orchestrator tool names + * MCP-qualified. The model precedence is enqueue override, then prompt, then + * default. Throws if no prompt is registered for the task's type. + */ +export function resolveTask( + registry: AgentRegistry, + task: QueuedTask, + store: QueueStore, +): ResolvedTask { + const prompt = registry.get(task.type); + if (!prompt) { + throw new Error(`No agent prompt registered for task type "${task.type}"`); + } + + const body = [ + renderInputs(task), + prompt.body, + renderHandoffContext(task, store), + ] + .filter(Boolean) + .join('\n\n'); + + return { + model: taskModel(registry, task), + ...agentRunTools(prompt), + prompt: body, + skills: prompt.skills, + }; +} + +/** The model a task runs on: enqueue override, then prompt frontmatter, then default. */ +export function taskModel(registry: AgentRegistry, task: QueuedTask): string { + return task.model ?? registry.get(task.type)?.model ?? DEFAULT_TASK_MODEL; +} diff --git a/src/lib/agent/agent-runner.ts b/src/lib/agent/agent-runner.ts index 2d693058..0f3269c4 100644 --- a/src/lib/agent/agent-runner.ts +++ b/src/lib/agent/agent-runner.ts @@ -372,12 +372,12 @@ async function bootstrapProgram( // orchestrator arm overwrites this with its own variant when it forks. analytics.setTag('variant', wizardMetadata.VARIANT); + // One MCP url for every region: the server resolves the user's region from + // the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is + // not needed here. const mcpUrl = session.localMcp ? 'http://localhost:8787/mcp' - : runtimeEnv('MCP_URL') || - (cloudRegion === 'eu' - ? 'https://mcp-eu.posthog.com/mcp' - : 'https://mcp.posthog.com/mcp'); + : runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp'; return { skillsBaseUrl, diff --git a/src/lib/agent/mcp-prompt-streaming.ts b/src/lib/agent/mcp-prompt-streaming.ts index dc8f8ff9..b3655f12 100644 --- a/src/lib/agent/mcp-prompt-streaming.ts +++ b/src/lib/agent/mcp-prompt-streaming.ts @@ -42,38 +42,11 @@ const MODEL = 'claude-sonnet-4-6'; // telemetry on average turn counts per prompt. const MAX_TURNS = 30; -function resolveMcpUrl(host: string): string { - const override = runtimeEnv('MCP_URL'); - if (override) return override; - // Parse the actual hostname rather than substring-matching the raw - // input. `host.includes('eu.posthog.com')` would let arbitrary URLs - // like `https://evil.eu.posthog.com.attacker.com` or - // `https://useu.posthog.commerce` route to the EU MCP endpoint - // (CodeQL: incomplete-url-substring-sanitization). Parsing into a - // hostname and checking exact match / trusted subdomain blocks both. - const hostname = parseHostname(host); - const isEu = - hostname === 'eu.posthog.com' || hostname.endsWith('.eu.posthog.com'); - return isEu - ? 'https://mcp-eu.posthog.com/mcp' - : 'https://mcp.posthog.com/mcp'; -} - -/** - * Normalize a host string into a hostname suitable for trust checks. - * Accepts either a full URL (`https://us.posthog.com`) or a bare host - * (`us.posthog.com`). Returns the hostname lowercased, or the trimmed - * input lowercased if parsing fails (defensive fallback so a malformed - * value still resolves to the safer-default US endpoint). - */ -function parseHostname(raw: string): string { - const trimmed = raw.trim().toLowerCase(); - try { - const withScheme = trimmed.includes('://') ? trimmed : `https://${trimmed}`; - return new URL(withScheme).hostname.toLowerCase(); - } catch { - return trimmed; - } +// One MCP url for every region: the server resolves the user's region from +// the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is +// not needed here. +function resolveMcpUrl(): string { + return runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp'; } /** @@ -245,7 +218,7 @@ export async function* runMcpPromptViaSdk(args: { once: true, }); - const mcpUrl = resolveMcpUrl(credentials.host); + const mcpUrl = resolveMcpUrl(); logToFile( `[runMcpPromptViaSdk] mcpUrl=${mcpUrl} model=${MODEL} resume=${ resumeSessionId ?? '(none)' From 7d45f09bc6a7d3a87dd0f71289402ce555a65c3d Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Tue, 16 Jun 2026 15:34:26 -0400 Subject: [PATCH 3/3] refactor(orchestrator): running status, queue cap, loader to lib/agent Review follow-ups on the queue and loader: - TaskStatus 'in_progress' -> 'running' (drop the snake_case outlier) - inline the nowIso() one-liner - document that dependsOn is a DAG by construction (ids point only at earlier tasks), so cycles cannot form - cap the queue at 30 tasks as a runaway backstop; real sizing rests on agent/skill design - move agent-prompt-loader from programs/orchestrator/ to lib/agent/ Co-Authored-By: Claude Fable 5 --- .../__tests__/agent-prompt-loader.test.ts | 259 -------------- .../__tests__/queue-tools.test.ts | 12 + .../orchestrator/agent-prompt-loader.ts | 336 ------------------ src/lib/programs/orchestrator/executor.ts | 2 +- .../orchestrator/orchestrator-runner.ts | 2 +- src/lib/programs/orchestrator/queue-tools.ts | 20 +- src/lib/programs/orchestrator/queue.ts | 18 +- 7 files changed, 44 insertions(+), 605 deletions(-) delete mode 100644 src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts delete mode 100644 src/lib/programs/orchestrator/agent-prompt-loader.ts diff --git a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts b/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts deleted file mode 100644 index 0d1c4966..00000000 --- a/src/lib/programs/orchestrator/__tests__/agent-prompt-loader.test.ts +++ /dev/null @@ -1,259 +0,0 @@ -import * as fs from 'fs'; -import * as os from 'os'; -import * as path from 'path'; -import { - agentRunTools, - assembleTaskPrompt, - buildRegistry, - parseAgentPrompt, - resolveTask, - taskModel, - type AgentPrompt, - type AgentRegistry, - type OrchestratorPromptContext, -} from '../agent-prompt-loader'; -import { QueueStore } from '../queue'; - -function tmpDir(): string { - return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-')); -} - -function registryOf(prompts: AgentPrompt[]): AgentRegistry { - return buildRegistry( - prompts.map((p) => ({ ...p, flow: 'test-flow' })), - 'test-flow', - ); -} - -describe('parseAgentPrompt', () => { - const sample = `--- -type: instrument-events -model: claude-sonnet-4-6 # cheapest model that succeeds -skills: [instrument-events] -allowedTools: [Read, Edit, Grep, Glob, Bash] -disallowedTools: [enqueue_task] -dependsOn: [init] ---- - -## Goal -Add at least one capture call. -`; - - it('parses frontmatter scalars and inline arrays', () => { - const p = parseAgentPrompt(sample, 'fallback'); - expect(p.type).toBe('instrument-events'); - expect(p.model).toBe('claude-sonnet-4-6'); - expect(p.skills).toEqual(['instrument-events']); - expect(p.allowedTools).toEqual(['Read', 'Edit', 'Grep', 'Glob', 'Bash']); - expect(p.disallowedTools).toEqual(['enqueue_task']); - expect(p.dependsOn).toEqual(['init']); - }); - - it('strips inline comments and keeps the body', () => { - const p = parseAgentPrompt(sample, 'fallback'); - expect(p.model).not.toContain('#'); - expect(p.body).toContain('## Goal'); - expect(p.body).not.toContain('---'); - }); - - it('falls back to the menu id when type is omitted', () => { - const p = parseAgentPrompt('---\nmodel: x\n---\nbody', 'install'); - expect(p.type).toBe('install'); - }); - - it('parses the flow from frontmatter', () => { - const p = parseAgentPrompt('---\nflow: audit\n---\nx', 'fix-events'); - expect(p.flow).toBe('audit'); - }); - - it('marks the seed from frontmatter; everything else is a task', () => { - expect(parseAgentPrompt('---\nseed: true\n---\nplan', 'planner').seed).toBe( - true, - ); - expect(parseAgentPrompt('---\nmodel: x\n---\nbody', 'install').seed).toBe( - false, - ); - }); - - it('defaults missing array fields to empty and model to undefined', () => { - const p = parseAgentPrompt('no frontmatter at all', 'stub'); - expect(p.model).toBeUndefined(); - expect(p.skills).toEqual([]); - expect(p.dependsOn).toEqual([]); - expect(p.body).toBe('no frontmatter at all'); - }); -}); - -describe('agentRunTools', () => { - it('MCP-qualifies orchestrator tools and passes native tools through', () => { - const p = parseAgentPrompt( - '---\nallowedTools: [Read, read_handoffs]\ndisallowedTools: [enqueue_task, complete_task, Bash]\n---\nx', - 't', - ); - const { allowedTools, disallowedTools } = agentRunTools(p); - expect(allowedTools).toEqual([ - 'Read', - 'mcp__posthog-wizard__read_handoffs', - ]); - expect(disallowedTools).toEqual([ - 'mcp__posthog-wizard__enqueue_task', - 'mcp__posthog-wizard__complete_task', - 'Bash', - ]); - }); -}); - -describe('buildRegistry', () => { - const prompt = (over: Partial): AgentPrompt => ({ - type: 'x', - seed: false, - skills: [], - allowedTools: [], - disallowedTools: [], - dependsOn: [], - body: 'b', - ...over, - }); - - it('scopes to one flow and keeps the seed out of the task types', () => { - const registry = buildRegistry( - [ - prompt({ type: 'plan-audit', flow: 'audit', seed: true }), - prompt({ type: 'fix-events', flow: 'audit' }), - prompt({ type: 'install', flow: 'posthog-integration' }), - prompt({ type: 'example' }), - ], - 'audit', - ); - expect(registry.types).toEqual(['fix-events']); - expect(registry.seed?.type).toBe('plan-audit'); - expect(registry.get('install')).toBeUndefined(); - // A flowless prompt (e.g. the documentation example) joins no registry. - expect(registry.get('example')).toBeUndefined(); - }); - - it('drops harness-excluded types; unrestricted runs keep them', () => { - const prompts = [ - prompt({ type: 'plan', flow: 'f', seed: true }), - prompt({ type: 'build', flow: 'f' }), - prompt({ type: 'dashboard', flow: 'f' }), - ]; - expect( - buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types, - ).toEqual(['build']); - expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']); - }); -}); - -describe('resolveTask', () => { - let dir: string; - let store: QueueStore; - - beforeEach(() => { - dir = tmpDir(); - store = new QueueStore(dir, 'run-1'); - }); - - afterEach(() => { - fs.rmSync(dir, { recursive: true, force: true }); - }); - - const prompt: AgentPrompt = { - type: 'capture', - seed: false, - model: 'claude-haiku-4-5-20251001', - skills: ['instrument-events'], - allowedTools: ['Read', 'Edit'], - disallowedTools: ['enqueue_task'], - dependsOn: ['plan-capture'], - body: '## Goal\nInstrument the planned events.', - }; - - it('throws when no prompt is registered for the type', () => { - const registry = registryOf([]); - const task = { type: 'capture', dependsOn: [] } as never; - expect(() => resolveTask(registry, task, store)).toThrow(/capture/); - }); - - it('resolves model, tools, and skills from the prompt', () => { - const registry = registryOf([prompt]); - const task = store.enqueue({ type: 'capture' }); - const resolved = resolveTask(registry, task, store); - expect(resolved.model).toBe('claude-haiku-4-5-20251001'); - expect(resolved.skills).toEqual(['instrument-events']); - expect(resolved.disallowedTools).toEqual([ - 'mcp__posthog-wizard__enqueue_task', - ]); - }); - - it('prefers the enqueue model override over the prompt model', () => { - const registry = registryOf([prompt]); - const task = store.enqueue({ type: 'capture', model: 'override-x' }); - expect(resolveTask(registry, task, store).model).toBe('override-x'); - }); - - it("appends upstream dependencies' handoffs as context", () => { - const registry = registryOf([prompt]); - const dep = store.enqueue({ type: 'plan-capture' }); - store.complete(dep.id, { - goals: 'decide events', - did: 'picked signup and purchase', - forNextAgent: 'instrument those two', - }); - const task = store.enqueue({ - type: 'capture', - dependsOn: [dep.id], - }); - const resolved = resolveTask(registry, task, store); - expect(resolved.prompt).toContain('Context from previous steps'); - expect(resolved.prompt).toContain('picked signup and purchase'); - expect(resolved.prompt).toContain('instrument those two'); - }); - - it('omits the context section when there are no handoffs', () => { - const registry = registryOf([prompt]); - const task = store.enqueue({ type: 'capture' }); - expect(resolveTask(registry, task, store).prompt).not.toContain( - 'Context from previous steps', - ); - }); -}); - -describe('taskModel', () => { - const prompt = parseAgentPrompt( - '---\nmodel: prompt-model\n---\nx', - 'capture', - ); - - it('prefers the enqueue override, then the prompt, then the default', () => { - const registry = registryOf([prompt]); - const task = { type: 'capture' }; - expect(taskModel(registry, { ...task, model: 'override' } as never)).toBe( - 'override', - ); - expect(taskModel(registry, task as never)).toBe('prompt-model'); - expect(taskModel(registryOf([]), task as never)).toBe('claude-sonnet-4-6'); - }); -}); - -describe('assembleTaskPrompt', () => { - const ctx: OrchestratorPromptContext = { - projectId: 1, - projectApiKey: 'phc_x', - host: 'https://us.posthog.com', - }; - - it('points the agent at its installed task instructions', () => { - const assembled = assembleTaskPrompt(ctx, 'do the task', [ - '.posthog-wizard/skills/capture/SKILL.md', - ]); - expect(assembled).toContain('.posthog-wizard/skills/capture/SKILL.md'); - expect(assembled).toContain('do the task'); - }); - - it('omits the instructions section when no skills are installed', () => { - expect(assembleTaskPrompt(ctx, 'do the task')).not.toContain( - 'task instructions', - ); - }); -}); diff --git a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts index 318825d2..33def856 100644 --- a/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts +++ b/src/lib/programs/orchestrator/__tests__/queue-tools.test.ts @@ -57,6 +57,18 @@ describe('checkEnqueueGuards', () => { const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' }); expect(r).toEqual({ ok: true }); }); + + it('refuses to grow the queue past the runaway cap', () => { + for (let i = 0; i < 30; i++) { + store.enqueue({ type: 'capture', inputs: { i } }); + } + const r = checkEnqueueGuards(ctx, { + type: 'init', + inputs: { i: 30 }, + reason: 'x', + }); + expect(r).toMatchObject({ ok: false, guard: 'queue-full' }); + }); }); describe('apply functions', () => { diff --git a/src/lib/programs/orchestrator/agent-prompt-loader.ts b/src/lib/programs/orchestrator/agent-prompt-loader.ts deleted file mode 100644 index 76870283..00000000 --- a/src/lib/programs/orchestrator/agent-prompt-loader.ts +++ /dev/null @@ -1,336 +0,0 @@ -/** - * Agent-prompt loader + registry. - * - * Agent prompts are the WHAT of a task: a markdown file per type, served from - * context-mill as the `agents` content type (parallel to skills). The frontmatter - * carries the artifacts the executor needs — model, the mini-skills to load (the - * HOW), the tools the task may use, and its dependencies — and the body is the - * instruction the agent reads. - * - * The registry is fetched once at startup and scoped to one flow — agents - * declare `flow` and (for the planner) `seed: true` in frontmatter, so each - * program (integration, audit, migration, ...) ships its own agent set and the - * loader stays generic. Every prompt is downloaded and parsed up front, so - * resolving a task to its run config is synchronous and adds no mid-drain - * network latency. The registry's type list also drives `enqueue_task` - * validation. - */ -import type { QueueStore, QueuedTask } from './queue'; -import type { ResolvedTask } from './executor'; - -/** - * The basics the client injects around every agent-prompt body. The `/agents/` - * files carry intent only (goal, success criteria); the wizard owns the I/O - * contract — who the agent is, how it reports, how it surfaces progress — so the - * authored prompts never restate it. - */ -export interface OrchestratorPromptContext { - projectId: number; - projectApiKey: string; - host: string; - /** Path to the framework's reference implementation (EXAMPLE.md), if available. */ - examplePath?: string; - /** Path to the framework's rules (COMMANDMENTS.md), if available. */ - commandmentsPath?: string; -} - -function projectContext(ctx: OrchestratorPromptContext): string { - return `You have access to the PostHog MCP server and the wizard tools. - -Project context: -- PostHog Project ID: ${ctx.projectId} -- PostHog public token: ${ctx.projectApiKey} -- PostHog Host: ${ctx.host}`; -} - -/** Points the agent at the framework's reference integration to learn patterns from. */ -function exampleReference(ctx: OrchestratorPromptContext): string | null { - if (!ctx.examplePath) return null; - return `A reference PostHog integration for this framework is at \`${ctx.examplePath}\`. It shows the target implementation pattern. Reference its patterns and conventions, adapting them to this codebase.`; -} - -/** The framework's rules ship with the reference skill; every task follows them. */ -function commandmentsReference(ctx: OrchestratorPromptContext): string | null { - if (!ctx.commandmentsPath) return null; - return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; -} - -const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; - -const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; - -/** - * Points the agent at its installed task instructions (the HOW). They live under - * the wizard's run dir, not `.claude/skills/`, so the SDK does not auto-load - * them — the prompt has to name them. - */ -function skillReference(paths: readonly string[]): string | null { - if (paths.length === 0) return null; - const list = paths.map((p) => `\`${p}\``).join(', '); - return `Your task instructions are at ${list}. Read them before you start and follow them. They are wizard scaffolding, not part of the project.`; -} - -/** A task agent's full prompt: injected basics, then the authored intent. */ -export function assembleTaskPrompt( - ctx: OrchestratorPromptContext, - body: string, - skillPaths: readonly string[] = [], -): string { - return [ - projectContext(ctx), - exampleReference(ctx), - commandmentsReference(ctx), - skillReference(skillPaths), - TASK_BASICS, - body, - ] - .filter(Boolean) - .join('\n\n'); -} - -/** The seed agent's full prompt: injected basics, then the authored intent. */ -export function assembleSeedPrompt( - ctx: OrchestratorPromptContext, - body: string, -): string { - return [projectContext(ctx), SEED_BASICS, body].join('\n\n'); -} - -/** Used when neither the enqueue call nor the prompt frontmatter names a model. */ -const DEFAULT_TASK_MODEL = 'claude-sonnet-4-6'; - -/** Orchestrator tools are MCP tools under the `posthog-wizard` server. Frontmatter - * names them short (e.g. `enqueue_task`); the SDK gates on the full name. */ -const ORCHESTRATOR_TOOL_PREFIX = 'mcp__posthog-wizard__'; -const ORCHESTRATOR_TOOLS = new Set([ - 'enqueue_task', - 'complete_task', - 'read_handoffs', -]); - -/** A parsed agent prompt. The frontmatter fields plus the markdown body. */ -export interface AgentPrompt { - type: string; - /** Human-readable title for the TUI; falls back to `type` when absent. */ - label?: string; - /** The flow this agent belongs to (the program id, e.g. \`posthog-integration\`). */ - flow?: string; - /** Marks the flow's planner: it seeds the queue and is not an enqueueable task. */ - seed: boolean; - model?: string; - skills: string[]; - allowedTools: string[]; - disallowedTools: string[]; - dependsOn: string[]; - body: string; -} - -export interface AgentRegistry { - /** The flow's enqueueable task types — every prompt except the seed. */ - readonly types: string[]; - /** The flow's planner, the one prompt marked `seed: true` in its frontmatter. */ - readonly seed?: AgentPrompt; - get(type: string): AgentPrompt | undefined; -} - -/** The registry for one flow's prompts. Pure; the loader feeds it the fetched set. */ -export function buildRegistry( - prompts: readonly AgentPrompt[], - flow: string, - opts?: { exclude?: readonly string[] }, -): AgentRegistry { - // The harness can exclude task types (CI excludes dashboards). An excluded - // type does not exist for the run: the seed cannot enqueue it and no agent - // is ever spun up for it. - const excluded = new Set(opts?.exclude ?? []); - const inFlow = prompts.filter( - (p) => p.flow === flow && !excluded.has(p.type), - ); - const byType = new Map(inFlow.map((p) => [p.type, p])); - return { - types: inFlow.filter((p) => !p.seed).map((p) => p.type), - seed: inFlow.find((p) => p.seed), - get: (type) => byType.get(type), - }; -} - -interface AgentMenu { - agents: { id: string; downloadUrl: string }[]; -} - -/** A native tool passes through; an orchestrator tool gets its MCP-qualified name. */ -function expandToolName(name: string): string { - return ORCHESTRATOR_TOOLS.has(name) - ? `${ORCHESTRATOR_TOOL_PREFIX}${name}` - : name; -} - -/** A prompt's allow/disallow lists with orchestrator tool names MCP-qualified. */ -export function agentRunTools(prompt: AgentPrompt): { - allowedTools: string[]; - disallowedTools: string[]; -} { - return { - allowedTools: prompt.allowedTools.map(expandToolName), - disallowedTools: prompt.disallowedTools.map(expandToolName), - }; -} - -function toStringArray(value: unknown): string[] { - if (!Array.isArray(value)) return []; - return value.filter((v): v is string => typeof v === 'string'); -} - -/** - * Parse the leading `---` frontmatter block and the markdown body. The - * frontmatter is a small, known schema (scalars and inline `[a, b]` arrays), so - * a tiny parser covers it without a YAML dependency. Inline `# comments` after a - * value are stripped. `fallbackType` is the menu id, used when the body omits - * `type:`. - */ -export function parseAgentPrompt( - text: string, - fallbackType: string, -): AgentPrompt { - const match = text.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/); - const frontmatter = match ? match[1] : ''; - const body = (match ? match[2] : text).trim(); - - const fields: Record = {}; - for (const rawLine of frontmatter.split(/\r?\n/)) { - const line = rawLine.replace(/\s+#.*$/, '').trim(); - if (!line || line.startsWith('#')) continue; - const kv = line.match(/^([\w-]+):\s*(.*)$/); - if (!kv) continue; - const [, key, raw] = kv; - if (raw.startsWith('[') && raw.endsWith(']')) { - fields[key] = raw - .slice(1, -1) - .split(',') - .map((s) => s.trim().replace(/^['"]|['"]$/g, '')) - .filter(Boolean); - } else { - fields[key] = raw.replace(/^['"]|['"]$/g, ''); - } - } - - const model = typeof fields.model === 'string' ? fields.model : undefined; - return { - type: typeof fields.type === 'string' ? fields.type : fallbackType, - label: typeof fields.label === 'string' ? fields.label : undefined, - flow: typeof fields.flow === 'string' ? fields.flow : undefined, - seed: fields.seed === 'true', - model, - skills: toStringArray(fields.skills), - allowedTools: toStringArray(fields.allowedTools), - disallowedTools: toStringArray(fields.disallowedTools), - dependsOn: toStringArray(fields.dependsOn), - body, - }; -} - -async function fetchText(url: string): Promise { - const res = await fetch(url); - if (!res.ok) { - throw new Error(`Fetch ${url} failed: ${res.status} ${res.statusText}`); - } - return res.text(); -} - -/** - * Fetch the agent menu and every agent prompt it lists, parse them, and build - * the registry for one flow. Throws if the menu cannot be fetched — the - * orchestrator cannot run without its prompts. - */ -export async function loadAgentRegistry( - skillsBaseUrl: string, - flow: string, - opts?: { exclude?: readonly string[] }, -): Promise { - const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`); - const menu = JSON.parse(menuRaw) as AgentMenu; - - const prompts = await Promise.all( - (menu.agents ?? []).map(async (entry) => { - const text = await fetchText(entry.downloadUrl); - return parseAgentPrompt(text, entry.id); - }), - ); - - return buildRegistry(prompts, flow, opts); -} - -/** - * Render a task's own inputs into a section, so a fanned-out task (e.g. one - * `capture` per event) sees the specific thing it owns. Empty when there are none. - */ -function renderInputs(task: QueuedTask): string { - const entries = Object.entries(task.inputs ?? {}); - if (entries.length === 0) return ''; - const lines = entries.map(([k, v]) => `- ${k}: ${formatInputValue(v)}`); - return `## Your task input\n\n${lines.join('\n')}`; -} - -function formatInputValue(value: unknown): string { - if (typeof value === 'string') return value; - return JSON.stringify(value); -} - -/** - * Render the handoffs of a task's completed dependencies into a context section, - * so a fresh agent sees what the upstream steps did. Empty when there are none. - */ -function renderHandoffContext(task: QueuedTask, store: QueueStore): string { - const lines: string[] = []; - for (const depId of task.dependsOn) { - const dep = store.get(depId); - const handoff = store.readHandoff(depId); - if (!dep || !handoff) continue; - lines.push(`### ${dep.type}`); - lines.push(`- did: ${handoff.did}`); - lines.push(`- for you: ${handoff.forNextAgent}`); - if (handoff.filesTouched?.length) { - lines.push(`- files: ${handoff.filesTouched.join(', ')}`); - } - lines.push(''); - } - if (lines.length === 0) return ''; - return `## Context from previous steps\n\n${lines.join('\n')}`.trim(); -} - -/** - * Resolve a queued task to its run config: the prompt body (with upstream - * handoffs appended), the model, and the tool lists with orchestrator tool names - * MCP-qualified. The model precedence is enqueue override, then prompt, then - * default. Throws if no prompt is registered for the task's type. - */ -export function resolveTask( - registry: AgentRegistry, - task: QueuedTask, - store: QueueStore, -): ResolvedTask { - const prompt = registry.get(task.type); - if (!prompt) { - throw new Error(`No agent prompt registered for task type "${task.type}"`); - } - - const body = [ - renderInputs(task), - prompt.body, - renderHandoffContext(task, store), - ] - .filter(Boolean) - .join('\n\n'); - - return { - model: taskModel(registry, task), - ...agentRunTools(prompt), - prompt: body, - skills: prompt.skills, - }; -} - -/** The model a task runs on: enqueue override, then prompt frontmatter, then default. */ -export function taskModel(registry: AgentRegistry, task: QueuedTask): string { - return task.model ?? registry.get(task.type)?.model ?? DEFAULT_TASK_MODEL; -} diff --git a/src/lib/programs/orchestrator/executor.ts b/src/lib/programs/orchestrator/executor.ts index abf0ed15..d8cfb976 100644 --- a/src/lib/programs/orchestrator/executor.ts +++ b/src/lib/programs/orchestrator/executor.ts @@ -101,7 +101,7 @@ export async function drainQueue( for (;;) { for (const task of store.nextRunnable()) { if (++starts > opts.maxStarts) break; - // runOne marks the task in_progress synchronously, so the next + // runOne marks the task running synchronously, so the next // nextRunnable() call no longer offers it. const p = runOne(store, runTask, task).finally(() => running.delete(task.id), diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index c16c905c..31df7372 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -43,7 +43,7 @@ import { resolveTask, taskModel, type OrchestratorPromptContext, -} from './agent-prompt-loader'; +} from '../../agent/agent-prompt-loader'; function toTodoStatus(status: TaskStatus): string { switch (status) { diff --git a/src/lib/programs/orchestrator/queue-tools.ts b/src/lib/programs/orchestrator/queue-tools.ts index 64e5bc93..5a05cda0 100644 --- a/src/lib/programs/orchestrator/queue-tools.ts +++ b/src/lib/programs/orchestrator/queue-tools.ts @@ -55,10 +55,18 @@ function dedupKey(type: string, inputs: Record): string { return `${type}::${stableStringify(inputs)}`; } +/** + * A backstop on total queue size. Tasks can enqueue tasks, so a misbehaving + * type could grow the queue without bound. Keeping the graph small is the job + * of good agent and skill design, not this number — it only stops a runaway. + * The real flow is ~9 tasks, so this sits well clear of it. + */ +const MAX_QUEUE_TASKS = 30; + /** * Validate an enqueue. Structural checks only — a real type, real dependencies, - * and not a literal duplicate. How much runs, and in what shape, is the task - * graph's business, not a knob's. + * not a literal duplicate, and not past the runaway backstop. How much runs, + * and in what shape, is the task graph's business, not a knob's. */ export function checkEnqueueGuards( ctx: OrchestratorToolsContext, @@ -66,6 +74,14 @@ export function checkEnqueueGuards( ): GuardResult { const tasks = ctx.store.list(); + if (tasks.length >= MAX_QUEUE_TASKS) { + return { + ok: false, + guard: 'queue-full', + message: `The queue already holds ${tasks.length} tasks (cap ${MAX_QUEUE_TASKS}). Refine the existing tasks rather than adding more.`, + }; + } + if (!ctx.validTypes.includes(args.type)) { return { ok: false, diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 302897e6..19545d9d 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -32,6 +32,12 @@ export interface QueuedTask { /** Human-readable label for the TUI, set by the enqueuing agent. */ label?: string; status: TaskStatus; + /** + * Ids of tasks that must finish before this one runs. Ids are generated at + * enqueue and dependsOn is never mutated, so a task can only depend on tasks + * created before it — the graph is a DAG by construction, cycles cannot + * form. Unknown ids are rejected by the enqueue_task guard. + */ dependsOn: string[]; inputs: Record; model?: string; @@ -76,6 +82,10 @@ export interface EnqueueInput { export const QUEUE_DIR_NAME = '.posthog-wizard'; const DEFAULT_MAX_ATTEMPTS = 2; +function nowIso(): string { + return new Date().toISOString(); +} + /** Every queue transition, in the order it is reflected. */ export type TransitionEvent = | 'enqueue' @@ -94,10 +104,6 @@ export interface QueueStoreOptions { onTransition?: (event: TransitionEvent, task: QueuedTask) => void; } -function nowIso(): string { - return new Date().toISOString(); -} - export class QueueStore { private tasks: QueuedTask[] = []; private readonly onTransition?: ( @@ -147,7 +153,7 @@ export class QueueStore { } /** - * True when no task is in progress and none can be started. Either everything + * True when no task is running and none can be started. Either everything * is terminal, or the only pending tasks are blocked by a failed dependency. */ isDrained(): boolean { @@ -229,7 +235,7 @@ export class QueueStore { return this.finish(id, TaskStatus.Failed, handoff); } - /** Put a failed/in-progress task back to pending for a retry within the run. */ + /** Put a failed/running task back to pending for a retry within the run. */ requeue(id: string): QueuedTask { const t = this.require(id); t.status = TaskStatus.Pending;