Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions scripts/smoke-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@ node --input-type=module -e "import '$DIST_BIN'" 2>&1 | head -5 | grep -q 'PostH
# builds and tsdown strips it; its env var name appearing in dist/*.js means
# dead-code elimination regressed and a prod surface leaked. Sourcemaps keep
# the original source, so only .js output counts.
OVERRIDE_MARKER='WIZARD_CI_FLAG_OVERRIDES'
OVERRIDE_MARKERS='WIZARD_CI_FLAG_OVERRIDES WIZARD_CI_EXCLUDE_TASKS'
if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
# CI builds must keep the path — its absence means the override silently
# stopped working and CI is back to testing live flags.
if ! grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
echo 'Smoke test failed: CI build is missing the CI flag-override path' >&2
exit 1
fi
# CI builds must keep the paths — their absence means the overrides silently
# stopped working and CI is back to testing live behavior.
for marker in $OVERRIDE_MARKERS; do
if ! grep -q "$marker" ./dist/*.js; then
echo "Smoke test failed: CI build is missing the $marker path" >&2
exit 1
fi
done
# And a real invocation must accept the env var. yargs claims every
# POSTHOG_WIZARD_-prefixed env var as a CLI option and strict-rejects
# unknown ones during command parse (--version/--help short-circuit and
Expand All @@ -44,10 +46,12 @@ if [ "${WIZARD_BUILD_NODE_ENV:-production}" = "ci" ]; then
exit 1
fi
else
if grep -q "$OVERRIDE_MARKER" ./dist/*.js; then
echo 'Smoke test failed: CI flag-override code leaked into a production build' >&2
exit 1
fi
for marker in $OVERRIDE_MARKERS; do
if grep -q "$marker" ./dist/*.js; then
echo "Smoke test failed: $marker code leaked into a production build" >&2
exit 1
fi
done
fi

# ── 3. --ci rejected in production builds ────────────────────────────────────
Expand Down
1 change: 1 addition & 0 deletions src/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type RuntimeEnvKey =
// Deliberately NOT POSTHOG_WIZARD_-prefixed: yargs .env('POSTHOG_WIZARD')
// would claim it as an unknown CLI option and strict-reject the run.
| 'WIZARD_CI_FLAG_OVERRIDES'
| 'WIZARD_CI_EXCLUDE_TASKS'
// Wizard CLI configuration (yargs POSTHOG_WIZARD_ prefix)
| 'POSTHOG_WIZARD_BENCHMARK_CONFIG'
| 'POSTHOG_WIZARD_BENCHMARK_FILE'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import {
type AgentRegistry,
type OrchestratorPromptContext,
} from '../agent-prompt-loader';
import { QueueStore } from '../queue';
import { QueueStore } from '../../programs/orchestrator/queue';

function tmpDir(): string {
return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-loader-test-'));
Expand Down Expand Up @@ -131,6 +131,18 @@ describe('buildRegistry', () => {
// A flowless prompt (e.g. the documentation example) joins no registry.
expect(registry.get('example')).toBeUndefined();
});

it('drops harness-excluded types; unrestricted runs keep them', () => {
const prompts = [
prompt({ type: 'plan', flow: 'f', seed: true }),
prompt({ type: 'build', flow: 'f' }),
prompt({ type: 'dashboard', flow: 'f' }),
];
expect(
buildRegistry(prompts, 'f', { exclude: ['dashboard'] }).types,
).toEqual(['build']);
expect(buildRegistry(prompts, 'f').types).toEqual(['build', 'dashboard']);
});
});

describe('resolveTask', () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
* network latency. The registry's type list also drives `enqueue_task`
* validation.
*/
import type { QueueStore, QueuedTask } from './queue';
import type { ResolvedTask } from './executor';
import type { QueueStore, QueuedTask } from '../programs/orchestrator/queue';
import type { ResolvedTask } from '../programs/orchestrator/executor';

/**
* The basics the client injects around every agent-prompt body. The `/agents/`
Expand Down Expand Up @@ -137,8 +137,15 @@ export interface AgentRegistry {
export function buildRegistry(
prompts: readonly AgentPrompt[],
flow: string,
opts?: { exclude?: readonly string[] },
): AgentRegistry {
const inFlow = prompts.filter((p) => p.flow === flow);
// The harness can exclude task types (CI excludes dashboards). An excluded
// type does not exist for the run: the seed cannot enqueue it and no agent
// is ever spun up for it.
const excluded = new Set(opts?.exclude ?? []);
const inFlow = prompts.filter(
(p) => p.flow === flow && !excluded.has(p.type),
);
const byType = new Map(inFlow.map((p) => [p.type, p]));
return {
types: inFlow.filter((p) => !p.seed).map((p) => p.type),
Expand Down Expand Up @@ -238,6 +245,7 @@ async function fetchText(url: string): Promise<string> {
export async function loadAgentRegistry(
skillsBaseUrl: string,
flow: string,
opts?: { exclude?: readonly string[] },
): Promise<AgentRegistry> {
const menuRaw = await fetchText(`${skillsBaseUrl}/agent-menu.json`);
const menu = JSON.parse(menuRaw) as AgentMenu;
Expand All @@ -249,7 +257,7 @@ export async function loadAgentRegistry(
}),
);

return buildRegistry(prompts, flow);
return buildRegistry(prompts, flow, opts);
}

/**
Expand Down
8 changes: 4 additions & 4 deletions src/lib/agent/agent-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -372,12 +372,12 @@ async function bootstrapProgram(
// orchestrator arm overwrites this with its own variant when it forks.
analytics.setTag('variant', wizardMetadata.VARIANT);

// One MCP url for every region: the server resolves the user's region from
// the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is
// not needed here.
const mcpUrl = session.localMcp
? 'http://localhost:8787/mcp'
: runtimeEnv('MCP_URL') ||
(cloudRegion === 'eu'
? 'https://mcp-eu.posthog.com/mcp'
: 'https://mcp.posthog.com/mcp');
: runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp';

return {
skillsBaseUrl,
Expand Down
39 changes: 6 additions & 33 deletions src/lib/agent/mcp-prompt-streaming.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,38 +42,11 @@ const MODEL = 'claude-sonnet-4-6';
// telemetry on average turn counts per prompt.
const MAX_TURNS = 30;

function resolveMcpUrl(host: string): string {
const override = runtimeEnv('MCP_URL');
if (override) return override;
// Parse the actual hostname rather than substring-matching the raw
// input. `host.includes('eu.posthog.com')` would let arbitrary URLs
// like `https://evil.eu.posthog.com.attacker.com` or
// `https://useu.posthog.commerce` route to the EU MCP endpoint
// (CodeQL: incomplete-url-substring-sanitization). Parsing into a
// hostname and checking exact match / trusted subdomain blocks both.
const hostname = parseHostname(host);
const isEu =
hostname === 'eu.posthog.com' || hostname.endsWith('.eu.posthog.com');
return isEu
? 'https://mcp-eu.posthog.com/mcp'
: 'https://mcp.posthog.com/mcp';
}

/**
* Normalize a host string into a hostname suitable for trust checks.
* Accepts either a full URL (`https://us.posthog.com`) or a bare host
* (`us.posthog.com`). Returns the hostname lowercased, or the trimmed
* input lowercased if parsing fails (defensive fallback so a malformed
* value still resolves to the safer-default US endpoint).
*/
function parseHostname(raw: string): string {
const trimmed = raw.trim().toLowerCase();
try {
const withScheme = trimmed.includes('://') ? trimmed : `https://${trimmed}`;
return new URL(withScheme).hostname.toLowerCase();
} catch {
return trimmed;
}
// One MCP url for every region: the server resolves the user's region from
// the bearer token, so the EU subdomain (a Claude Code OAuth workaround) is
// not needed here.
function resolveMcpUrl(): string {
return runtimeEnv('MCP_URL') || 'https://mcp.posthog.com/mcp';
}

/**
Expand Down Expand Up @@ -245,7 +218,7 @@ export async function* runMcpPromptViaSdk(args: {
once: true,
});

const mcpUrl = resolveMcpUrl(credentials.host);
const mcpUrl = resolveMcpUrl();
logToFile(
`[runMcpPromptViaSdk] mcpUrl=${mcpUrl} model=${MODEL} resume=${
resumeSessionId ?? '(none)'
Expand Down
12 changes: 12 additions & 0 deletions src/lib/programs/orchestrator/__tests__/queue-tools.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@ describe('checkEnqueueGuards', () => {
const r = checkEnqueueGuards(ctx, { type: 'init', reason: 'x' });
expect(r).toEqual({ ok: true });
});

it('refuses to grow the queue past the runaway cap', () => {
for (let i = 0; i < 30; i++) {
store.enqueue({ type: 'capture', inputs: { i } });
}
const r = checkEnqueueGuards(ctx, {
type: 'init',
inputs: { i: 30 },
reason: 'x',
});
expect(r).toMatchObject({ ok: false, guard: 'queue-full' });
});
});

describe('apply functions', () => {
Expand Down
2 changes: 1 addition & 1 deletion src/lib/programs/orchestrator/executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ export async function drainQueue(
for (;;) {
for (const task of store.nextRunnable()) {
if (++starts > opts.maxStarts) break;
// runOne marks the task in_progress synchronously, so the next
// runOne marks the task running synchronously, so the next
// nextRunnable() call no longer offers it.
const p = runOne(store, runTask, task).finally(() =>
running.delete(task.id),
Expand Down
4 changes: 3 additions & 1 deletion src/lib/programs/orchestrator/orchestrator-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import { detectNodePackageManagers } from '../../detection/package-manager';
import { installSkillById } from '../../wizard-tools';
import { getUI } from '../../../ui';
import { analytics } from '../../../utils/analytics';
import { ciExcludedTaskTypes } from '../../../utils/ci-flag-overrides';
import { logToFile } from '../../../utils/debug';
import type { ProgramConfig } from '../program-step';
import type { BootstrapResult } from '../../agent/agent-runner';
Expand All @@ -42,7 +43,7 @@ import {
resolveTask,
taskModel,
type OrchestratorPromptContext,
} from './agent-prompt-loader';
} from '../../agent/agent-prompt-loader';

function toTodoStatus(status: TaskStatus): string {
switch (status) {
Expand Down Expand Up @@ -88,6 +89,7 @@ export async function runOrchestrator(
const registry = await loadAgentRegistry(
boot.skillsBaseUrl,
programConfig.id,
{ exclude: ciExcludedTaskTypes() },
);
const seedPrompt = registry.seed;
if (!seedPrompt) {
Expand Down
20 changes: 18 additions & 2 deletions src/lib/programs/orchestrator/queue-tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,33 @@ function dedupKey(type: string, inputs: Record<string, unknown>): string {
return `${type}::${stableStringify(inputs)}`;
}

/**
* A backstop on total queue size. Tasks can enqueue tasks, so a misbehaving
* type could grow the queue without bound. Keeping the graph small is the job
* of good agent and skill design, not this number — it only stops a runaway.
* The real flow is ~9 tasks, so this sits well clear of it.
*/
const MAX_QUEUE_TASKS = 30;

/**
* Validate an enqueue. Structural checks only — a real type, real dependencies,
* and not a literal duplicate. How much runs, and in what shape, is the task
* graph's business, not a knob's.
* not a literal duplicate, and not past the runaway backstop. How much runs,
* and in what shape, is the task graph's business, not a knob's.
*/
export function checkEnqueueGuards(
ctx: OrchestratorToolsContext,
args: EnqueueArgs,
): GuardResult {
const tasks = ctx.store.list();

if (tasks.length >= MAX_QUEUE_TASKS) {
return {
ok: false,
guard: 'queue-full',
message: `The queue already holds ${tasks.length} tasks (cap ${MAX_QUEUE_TASKS}). Refine the existing tasks rather than adding more.`,
};
}

if (!ctx.validTypes.includes(args.type)) {
return {
ok: false,
Expand Down
18 changes: 12 additions & 6 deletions src/lib/programs/orchestrator/queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ export interface QueuedTask {
/** Human-readable label for the TUI, set by the enqueuing agent. */
label?: string;
status: TaskStatus;
/**
* Ids of tasks that must finish before this one runs. Ids are generated at
* enqueue and dependsOn is never mutated, so a task can only depend on tasks
* created before it — the graph is a DAG by construction, cycles cannot
* form. Unknown ids are rejected by the enqueue_task guard.
*/
dependsOn: string[];
inputs: Record<string, unknown>;
model?: string;
Expand Down Expand Up @@ -76,6 +82,10 @@ export interface EnqueueInput {
export const QUEUE_DIR_NAME = '.posthog-wizard';
const DEFAULT_MAX_ATTEMPTS = 2;

function nowIso(): string {
return new Date().toISOString();
}

/** Every queue transition, in the order it is reflected. */
export type TransitionEvent =
| 'enqueue'
Expand All @@ -94,10 +104,6 @@ export interface QueueStoreOptions {
onTransition?: (event: TransitionEvent, task: QueuedTask) => void;
}

function nowIso(): string {
return new Date().toISOString();
}

export class QueueStore {
private tasks: QueuedTask[] = [];
private readonly onTransition?: (
Expand Down Expand Up @@ -147,7 +153,7 @@ export class QueueStore {
}

/**
* True when no task is in progress and none can be started. Either everything
* True when no task is running and none can be started. Either everything
* is terminal, or the only pending tasks are blocked by a failed dependency.
*/
isDrained(): boolean {
Expand Down Expand Up @@ -229,7 +235,7 @@ export class QueueStore {
return this.finish(id, TaskStatus.Failed, handoff);
}

/** Put a failed/in-progress task back to pending for a retry within the run. */
/** Put a failed/running task back to pending for a retry within the run. */
requeue(id: string): QueuedTask {
const t = this.require(id);
t.status = TaskStatus.Pending;
Expand Down
36 changes: 35 additions & 1 deletion src/utils/__tests__/ci-flag-overrides.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import { applyCiFlagOverrides } from '@utils/ci-flag-overrides';
import {
applyCiFlagOverrides,
ciExcludedTaskTypes,
} from '@utils/ci-flag-overrides';

jest.mock('@utils/debug', () => ({
logToFile: jest.fn(),
Expand Down Expand Up @@ -61,3 +64,34 @@ describe('applyCiFlagOverrides', () => {
});
});
});

describe('ciExcludedTaskTypes', () => {
afterEach(() => {
delete process.env.WIZARD_CI_EXCLUDE_TASKS;
});

it('is empty when nothing is excluded', () => {
expect(ciExcludedTaskTypes()).toEqual([]);
});

it('parses the comma-separated list, ignoring stray whitespace', () => {
process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard, report ,';
expect(ciExcludedTaskTypes()).toEqual(['dashboard', 'report']);
});

it('is inert in production builds', () => {
const prevNodeEnv = process.env.NODE_ENV;
process.env.NODE_ENV = 'production';
process.env.WIZARD_CI_EXCLUDE_TASKS = 'dashboard';
let result: readonly string[] | undefined;
jest.isolateModules(() => {
// eslint-disable-next-line @typescript-eslint/no-var-requires
const prod = require('@utils/ci-flag-overrides') as {
ciExcludedTaskTypes: typeof ciExcludedTaskTypes;
};
result = prod.ciExcludedTaskTypes();
});
process.env.NODE_ENV = prevNodeEnv;
expect(result).toEqual([]);
});
});
Loading
Loading