diff --git a/src/lib/agent/__tests__/agent-prompt-loader.test.ts b/src/lib/agent/__tests__/agent-prompt-loader.test.ts index a0a2b04a..d2ad5b7a 100644 --- a/src/lib/agent/__tests__/agent-prompt-loader.test.ts +++ b/src/lib/agent/__tests__/agent-prompt-loader.test.ts @@ -217,6 +217,46 @@ describe('resolveTask', () => { 'Context from previous steps', ); }); + + it('includes transitive ancestors, not just direct dependencies', () => { + const registry = registryOf([prompt]); + // install -> capture -> (this task). The task depends only on capture, but + // install's context must still reach it so nothing is silently lost. + const install = store.enqueue({ type: 'install' }); + store.complete(install.id, { + goals: 'declare the SDK', + did: 'added posthog to the manifest', + forNextAgent: 'SDK is declared, not yet installed', + }); + const capture = store.enqueue({ type: 'capture', dependsOn: [install.id] }); + store.complete(capture.id, { + goals: 'instrument events', + did: 'added capture calls', + forNextAgent: 'events are in', + }); + const task = store.enqueue({ type: 'capture', dependsOn: [capture.id] }); + const { prompt: out } = resolveTask(registry, task, store); + expect(out).toContain('added posthog to the manifest'); // transitive + expect(out).toContain('added capture calls'); // direct + }); + + it('lists each ancestor once for diamond dependencies', () => { + const registry = registryOf([prompt]); + const install = store.enqueue({ type: 'install' }); + store.complete(install.id, { + goals: 'g', + did: 'manifest entry added', + forNextAgent: 'n', + }); + const a = store.enqueue({ type: 'identify', dependsOn: [install.id] }); + store.complete(a.id, { goals: 'g', did: 'a-did', forNextAgent: 'n' }); + const b = store.enqueue({ type: 'identify', dependsOn: [install.id] }); + store.complete(b.id, { goals: 'g', did: 'b-did', forNextAgent: 'n' }); + // Resolved task must be a registered type (capture); its ancestors need not be. + const task = store.enqueue({ type: 'capture', dependsOn: [a.id, b.id] }); + const { prompt: out } = resolveTask(registry, task, store); + expect(out.match(/manifest entry added/g)).toHaveLength(1); + }); }); describe('taskModel', () => { diff --git a/src/lib/agent/agent-prompt-loader.ts b/src/lib/agent/agent-prompt-loader.ts index 1fe487b1..8b83ff9a 100644 --- a/src/lib/agent/agent-prompt-loader.ts +++ b/src/lib/agent/agent-prompt-loader.ts @@ -55,7 +55,7 @@ function commandmentsReference(ctx: OrchestratorPromptContext): string | null { return `Framework rules for this integration are at \`${ctx.commandmentsPath}\`. Read them before you edit and follow them.`; } -const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only within this project's own directory; nothing outside it is part of your task. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; +const TASK_BASICS = `You are one isolated task in a larger PostHog workflow, run as a fresh agent with no memory of the other tasks beyond the context you are given. Do only your task, then report exactly once by calling complete_task with a structured handoff: what your goal was, what you did, and what the next agent should know. When you are given context from previous steps, trust it — those agents already did their work, so do not re-verify or re-read what their handoffs tell you. Build on it and move fast. Read a file before you edit it, so your own changes do not duplicate what is already there. Work only inside this project's own directory: never read, list, or search (find, ls, grep, glob) outside it — not the OS, not other projects, not global package caches. If your task seems to need something outside this directory, it does not — skip that part and say so in your handoff rather than hunting across the filesystem. If your task does not apply to this project — there is genuinely nothing for it to do — report it with status \`skipped\` and say why, rather than marking it done.`; const SEED_BASICS = `You are the orchestrator. Plan the work and seed the queue with enqueue_task — each call returns an id you can pass as a dependency to a later task. Give each task a short label for the UI — the action in a few words, not file names, class names, or other specifics. You are not a task yourself: do not call complete_task and do not edit the project.`; @@ -277,14 +277,37 @@ function formatInputValue(value: unknown): string { } /** - * Render the handoffs of a task's completed dependencies into a context section, - * so a fresh agent sees what the upstream steps did. Empty when there are none. + * The ids of every task `task` transitively depends on — the full upstream + * chain, not just direct dependencies — ordered roots-first, each once. A `seen` + * set dedupes diamonds and guards against cycles. + */ +function ancestorIds(task: QueuedTask, store: QueueStore): string[] { + const seen = new Set(); + const ordered: string[] = []; + const visit = (id: string): void => { + if (seen.has(id)) return; + seen.add(id); + const t = store.get(id); + if (!t) return; + for (const dep of t.dependsOn) visit(dep); // ancestors before dependents + ordered.push(id); + }; + for (const dep of task.dependsOn) visit(dep); + return ordered; +} + +/** + * Render the handoffs of every step `task` transitively depends on into a context + * section, so a fresh agent sees the whole upstream chain — not just its direct + * dependencies. Reliability over token economy: a step must never have to + * re-discover what any ancestor already established just because an intermediate + * handoff happened to omit it. Empty when there are no completed ancestors. */ function renderHandoffContext(task: QueuedTask, store: QueueStore): string { const lines: string[] = []; - for (const depId of task.dependsOn) { - const dep = store.get(depId); - const handoff = store.readHandoff(depId); + for (const id of ancestorIds(task, store)) { + const dep = store.get(id); + const handoff = store.readHandoff(id); if (!dep || !handoff) continue; lines.push(`### ${dep.type}`); lines.push(`- did: ${handoff.did}`); diff --git a/src/lib/programs/orchestrator/__tests__/queue.test.ts b/src/lib/programs/orchestrator/__tests__/queue.test.ts index 7f34f283..3b493d0f 100644 --- a/src/lib/programs/orchestrator/__tests__/queue.test.ts +++ b/src/lib/programs/orchestrator/__tests__/queue.test.ts @@ -3,6 +3,7 @@ import * as os from 'os'; import * as path from 'path'; import { QueueStore, + QUEUE_DIR_NAME, type QueueFile, type TaskHandoff, } from '@lib/programs/orchestrator/queue'; @@ -28,6 +29,15 @@ describe('QueueStore', () => { fs.rmSync(dir, { recursive: true, force: true }); }); + it('drops a self-explaining .DELETE-ME.md in the cache folder', () => { + const note = fs.readFileSync( + path.join(dir, QUEUE_DIR_NAME, '.DELETE-ME.md'), + 'utf8', + ); + expect(note).toContain('safely delete'); + expect(note).toContain(`${QUEUE_DIR_NAME}/`); + }); + it('enqueues a pending task with defaults', () => { const t = q.enqueue({ type: 'install' }); expect(t.status).toBe('pending'); diff --git a/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts new file mode 100644 index 00000000..544f17b2 --- /dev/null +++ b/src/lib/programs/orchestrator/__tests__/run-metrics.test.ts @@ -0,0 +1,68 @@ +import { RunMetrics } from '@lib/programs/orchestrator/run-metrics'; + +describe('RunMetrics', () => { + it('reports time to first start and first completion from run start', () => { + const m = new RunMetrics(0); + m.recordStart(100); + m.recordComplete(300); + m.recordStart(1000); + m.recordComplete(1100); + const s = m.summary(); + expect(s.time_to_first_task_ms).toBe(100); + expect(s.time_to_first_completion_ms).toBe(300); + }); + + it('max_gap_ms is the longest silence across all visible transitions', () => { + const m = new RunMetrics(0); + m.recordStart(100); // visible @100 + m.recordComplete(300); // gap 200 + m.recordStart(1000); // gap 700 ← longest + m.recordComplete(1100); // gap 100 + expect(m.summary().max_gap_ms).toBe(700); + }); + + it('recordStart returns ms_since_run_start and the gap from the previous start', () => { + const m = new RunMetrics(0); + expect(m.recordStart(100)).toEqual({ + ms_since_run_start: 100, + gap_since_prev_start_ms: undefined, + }); + expect(m.recordStart(1000)).toEqual({ + ms_since_run_start: 1000, + gap_since_prev_start_ms: 900, + }); + }); + + it('reports undefined timings for a run with no transitions, not zero', () => { + const s = new RunMetrics(0).summary(); + expect(s.time_to_first_task_ms).toBeUndefined(); + expect(s.time_to_first_completion_ms).toBeUndefined(); + expect(s.max_gap_ms).toBeUndefined(); + }); + + it('a single started-but-unfinished task reports a real zero gap and no completion', () => { + const m = new RunMetrics(0); + m.recordStart(50); + const s = m.summary(); + expect(s.time_to_first_task_ms).toBe(50); + expect(s.time_to_first_completion_ms).toBeUndefined(); + expect(s.max_gap_ms).toBe(0); // one visible transition → genuine 0, not undefined + }); + + it('counts a retry stall (start to re-start) as silence', () => { + const m = new RunMetrics(0); + m.recordStart(0); + // the task ended without reporting and was requeued (invisible), then + // re-started 5s later — that stall is a silence the user sees. + m.recordStart(5000); + expect(m.summary().max_gap_ms).toBe(5000); + }); + + it('treats skip and fail as visible transitions for gap tracking', () => { + const m = new RunMetrics(0); + m.recordStart(0); + m.recordTerminal(2000); // skip or fail, gap 2000 + m.recordStart(2500); // gap 500 + expect(m.summary().max_gap_ms).toBe(2000); + }); +}); diff --git a/src/lib/programs/orchestrator/orchestrator-runner.ts b/src/lib/programs/orchestrator/orchestrator-runner.ts index 31df7372..6353d8c8 100644 --- a/src/lib/programs/orchestrator/orchestrator-runner.ts +++ b/src/lib/programs/orchestrator/orchestrator-runner.ts @@ -20,7 +20,7 @@ import { } from '../../agent/agent-interface'; import { OutroKind, type WizardSession } from '../../wizard-session'; import { detectNodePackageManagers } from '../../detection/package-manager'; -import { installSkillById } from '../../wizard-tools'; +import { installSkillById, fetchSkillMenu } from '../../wizard-tools'; import { getUI } from '../../../ui'; import { analytics } from '../../../utils/analytics'; import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides'; @@ -35,6 +35,7 @@ import { type QueuedTask, } from './queue'; import { drainQueue, type RunTask } from './executor'; +import { RunMetrics } from './run-metrics'; import { agentRunTools, assembleSeedPrompt, @@ -74,6 +75,27 @@ function sessionRunOptions(session: WizardSession): WizardRunOptions { }; } +/** + * The framework reference is the full `integration` skill. `session.skillId` is + * the bare framework (e.g. `django`), but the skill menu ids it as + * `integration-`. Resolve to the menu id: exact `integration-` + * (the 1:1 frameworks — django, python, flask, …), else the first granular variant + * under it (e.g. `integration-nextjs-app-router`). Undefined when none exists. + */ +async function resolveReferenceSkillId( + skillsBaseUrl: string, + framework: string, +): Promise { + const menu = await fetchSkillMenu(skillsBaseUrl); + if (!menu) return undefined; + const ids = Object.values(menu.categories) + .flat() + .map((s) => s.id); + const exact = `integration-${framework}`; + if (ids.includes(exact)) return exact; + return ids.find((id) => id.startsWith(`integration-${framework}-`)); +} + export async function runOrchestrator( session: WizardSession, programConfig: ProgramConfig, @@ -107,8 +129,7 @@ export async function runOrchestrator( // queue transitions, with the resolved model so cheap work is attributable // to cheap models. const runStartMs = Date.now(); - let firstStartMs: number | undefined; - let lastStartMs: number | undefined; + const metrics = new RunMetrics(runStartMs); const durationMs = (t: QueuedTask) => t.startedAt && t.finishedAt ? Date.parse(t.finishedAt) - Date.parse(t.startedAt) @@ -129,31 +150,28 @@ export async function runOrchestrator( dynamic: task.enqueuedBy !== 'orchestrator', }); break; - case 'start': { - const now = Date.now(); + case 'start': analytics.wizardCapture('orchestrator task started', { ...base, - ms_since_run_start: now - runStartMs, - gap_since_prev_start_ms: - lastStartMs === undefined ? undefined : now - lastStartMs, + ...metrics.recordStart(Date.now()), }); - firstStartMs ??= now; - lastStartMs = now; break; - } case 'complete': + metrics.recordComplete(Date.now()); analytics.wizardCapture('orchestrator task completed', { ...base, duration_ms: durationMs(task), }); break; case 'skip': + metrics.recordTerminal(Date.now()); analytics.wizardCapture('orchestrator task skipped', { ...base, duration_ms: durationMs(task), }); break; case 'fail': + metrics.recordTerminal(Date.now()); analytics.wizardCapture('orchestrator task failed', { ...base, duration_ms: durationMs(task), @@ -172,9 +190,12 @@ export async function runOrchestrator( // skill — only the example file is read, when the agent's prompt points at it. let examplePath: string | undefined; let commandmentsPath: string | undefined; - if (session.skillId) { + const referenceSkillId = session.skillId + ? await resolveReferenceSkillId(boot.skillsBaseUrl, session.skillId) + : undefined; + if (referenceSkillId) { const ref = await installSkillById( - session.skillId, + referenceSkillId, session.installDir, boot.skillsBaseUrl, path.join(QUEUE_DIR_NAME, 'reference'), @@ -189,8 +210,14 @@ export async function runOrchestrator( commandmentsPath = commandments; } } else { - logToFile(`[orchestrator] reference example unavailable: ${ref.kind}`); + logToFile( + `[orchestrator] reference unavailable: ${ref.kind} (${referenceSkillId})`, + ); } + } else if (session.skillId) { + logToFile( + `[orchestrator] no integration skill for framework "${session.skillId}"`, + ); } // The client injects the basics (project context + the I/O contract) around @@ -353,11 +380,20 @@ export async function runOrchestrator( try { await drainQueue(store, runTask); } finally { - // Success or failure, the installed task instructions never outlive the run. - rmSync(path.join(session.installDir, taskSkillsRoot), { - recursive: true, - force: true, - }); + // Success or failure, no run artifact outlives the run — wipe the whole + // cache folder (queue, handoffs, reference example, installed task + // instructions). The .DELETE-ME.md inside is the fallback if we don't. + try { + rmSync(path.join(session.installDir, QUEUE_DIR_NAME), { + recursive: true, + force: true, + }); + } catch (err) { + analytics.captureException( + err instanceof Error ? err : new Error(String(err)), + { step: 'orchestrator_cache_cleanup' }, + ); + } } renderQueue(); @@ -372,8 +408,11 @@ export async function runOrchestrator( tasks_failed: summary.failed, tasks_skipped: summary.skipped, total_duration_ms: Date.now() - runStartMs, - time_to_first_task_ms: - firstStartMs === undefined ? undefined : firstStartMs - runStartMs, + ...metrics.summary(), + dynamic_enqueue_count: store + .list() + .filter((t) => t.enqueuedBy !== 'orchestrator').length, + retried_task_count: store.list().filter((t) => t.attempts > 1).length, }); // The build step flags any unresolved conflict in its handoff; surface the diff --git a/src/lib/programs/orchestrator/queue.ts b/src/lib/programs/orchestrator/queue.ts index 19545d9d..4aa7c368 100644 --- a/src/lib/programs/orchestrator/queue.ts +++ b/src/lib/programs/orchestrator/queue.ts @@ -6,9 +6,10 @@ * returns every pending task whose dependencies are satisfied, and how many of * those run at once is decided by the task graph, not the queue. * - * Every transition rewrites `/.posthog-wizard/queue.json`, a small - * file holding the whole queue, handoffs included. Today it is the run's - * log and the report's source; later it is the resume point. + * Every transition rewrites `/.posthog-wizard-cache/queue.json`, a + * small file holding the whole queue, handoffs included. It is the run's log + * and the report's source. The whole cache folder is run-scoped and wiped when + * the run ends. */ import * as fs from 'fs'; import * as path from 'path'; @@ -79,13 +80,22 @@ export interface EnqueueInput { enqueuedBy?: string; } -export const QUEUE_DIR_NAME = '.posthog-wizard'; +export const QUEUE_DIR_NAME = '.posthog-wizard-cache'; const DEFAULT_MAX_ATTEMPTS = 2; function nowIso(): string { return new Date().toISOString(); } +/** Dropped in the cache folder so an orphaned copy explains itself. */ +const DELETE_ME_FILE = '.DELETE-ME.md'; +const DELETE_ME_BODY = `# Safe to delete + +This folder contains run artifacts from the PostHog Wizard. This should have +been deleted if the Wizard has finished running. If this wasn't deleted for +some reason, you can safely delete the entire \`${QUEUE_DIR_NAME}/\` folder. +`; + /** Every queue transition, in the order it is reflected. */ export type TransitionEvent = | 'enqueue' @@ -120,6 +130,7 @@ export class QueueStore { const dir = path.join(installDir, QUEUE_DIR_NAME); this.queuePath = path.join(dir, 'queue.json'); fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(path.join(dir, DELETE_ME_FILE), DELETE_ME_BODY); } // ── Reads ─────────────────────────────────────────────────────────── diff --git a/src/lib/programs/orchestrator/run-metrics.ts b/src/lib/programs/orchestrator/run-metrics.ts new file mode 100644 index 00000000..8ea82415 --- /dev/null +++ b/src/lib/programs/orchestrator/run-metrics.ts @@ -0,0 +1,85 @@ +/** + * Responsiveness metrics for an orchestrator run. + * + * Responsiveness is the experiment's headline: how quickly the user sees the + * first progress, and whether progress stays steady (no long silences). The math + * is accumulated from queue transitions but kept here, pure and time-injected, so + * it is unit-testable away from the runner. Wall-clock times are passed in as + * milliseconds; the caller owns the clock. + */ + +export interface RunMetricsSummary { + /** Run start → first task started. */ + time_to_first_task_ms?: number; + /** Run start → first task completed (the first visible "done"). */ + time_to_first_completion_ms?: number; + /** Longest silence between two consecutive user-visible transitions. */ + max_gap_ms?: number; +} + +/** The per-event timing the `orchestrator task started` event reports. */ +export interface StartTiming { + ms_since_run_start: number; + gap_since_prev_start_ms?: number; +} + +export class RunMetrics { + private firstStartMs?: number; + private lastStartMs?: number; + private firstCompleteMs?: number; + private lastVisibleMs?: number; + private maxGapMs = 0; + + constructor(private readonly runStartMs: number) {} + + /** A task started. Returns the per-start-event timing for the start event. */ + recordStart(nowMs: number): StartTiming { + const timing: StartTiming = { + ms_since_run_start: nowMs - this.runStartMs, + gap_since_prev_start_ms: + this.lastStartMs === undefined ? undefined : nowMs - this.lastStartMs, + }; + this.firstStartMs ??= nowMs; + this.lastStartMs = nowMs; + this.markVisible(nowMs); + return timing; + } + + /** A task completed. */ + recordComplete(nowMs: number): void { + this.firstCompleteMs ??= nowMs; + this.markVisible(nowMs); + } + + /** A task reached a terminal non-complete state the user sees (skip/fail). */ + recordTerminal(nowMs: number): void { + this.markVisible(nowMs); + } + + /** + * The run-level responsiveness summary. Timings are `undefined` when the + * relevant transition never happened (e.g. a run that started no task), so a + * no-task run stays distinguishable from a genuine zero. + */ + summary(): RunMetricsSummary { + return { + time_to_first_task_ms: + this.firstStartMs === undefined + ? undefined + : this.firstStartMs - this.runStartMs, + time_to_first_completion_ms: + this.firstCompleteMs === undefined + ? undefined + : this.firstCompleteMs - this.runStartMs, + max_gap_ms: this.lastVisibleMs === undefined ? undefined : this.maxGapMs, + }; + } + + /** requeue is not user-visible, so a retry stall counts as silence here. */ + private markVisible(nowMs: number): void { + if (this.lastVisibleMs !== undefined) { + this.maxGapMs = Math.max(this.maxGapMs, nowMs - this.lastVisibleMs); + } + this.lastVisibleMs = nowMs; + } +} diff --git a/src/ui/tui/primitives/LogViewer.tsx b/src/ui/tui/primitives/LogViewer.tsx index 9fa02cd4..8277802c 100644 --- a/src/ui/tui/primitives/LogViewer.tsx +++ b/src/ui/tui/primitives/LogViewer.tsx @@ -15,8 +15,10 @@ import { useState, useEffect } from 'react'; import * as fs from 'fs'; import { useStdoutDimensions } from '@ui/tui/hooks/useStdoutDimensions'; -/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar + tab bar */ -const CHROME_ROWS = 8; +/** Rows consumed by TitleBar + spacer + ScreenContainer padding + status bar + + * tab bar, with a couple rows of headroom so the tail never crowds the status + * bar below it. */ +const CHROME_ROWS = 10; /** Bytes read from the end of the log per refresh — large enough to contain * any practical visible window of lines, small enough to allocate cheaply. */