diff --git a/.claude/settings.json b/.claude/settings.json index 9d03dcc9..324fafb9 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -2,7 +2,8 @@ "permissions": { "allow": [ "mcp__plugin_context7_context7__query-docs", - "mcp__plugin_context7_context7__resolve-library-id" + "mcp__plugin_context7_context7__resolve-library-id", + "mcp__backlog__task_view" ] } } diff --git a/experiment/CLAUDE.md b/experiment/CLAUDE.md index 36c66c81..fa8627b2 100644 --- a/experiment/CLAUDE.md +++ b/experiment/CLAUDE.md @@ -78,7 +78,7 @@ The experiment supports multiple configurable scenarios. Each scenario includes - `lib/logging.ts` - Event logging utilities ### API Routes -- `app/api/chat/route.ts` - Chat endpoint (GPT-5.2 with scenario-specific system prompt) +- `app/api/chat/route.ts` - Chat endpoint (colleague model + reasoning effort come from the scenario config, with scenario-specific system prompt) - `app/api/writing-support/route.ts` - AI writing suggestions - `app/api/log/route.ts` - Event logging endpoint @@ -93,14 +93,24 @@ Realistic timing works as follows: The colleague finds a moment to read your mes ### Adding New Scenarios -To add a new scenario, edit `lib/studyConfig.ts` and add a new entry to the `SCENARIOS` object. Each scenario requires: +To add a new scenario, edit `lib/scenarios.json` and add a new entry (`lib/studyConfig.ts` derives the typed `SCENARIOS` object from it). Each scenario requires: - **colleague**: name, firstName, role - **recipient**: name, email - **taskInstructions**: title, description, companyFraming -- **chat**: initialMessages, followUpMessage, systemPrompt +- **chat**: model, reasoningEffort, initialMessages, followUpMessage, systemPromptLines (joined into `systemPrompt` at runtime) +- **analysis** (eval-only): context, keyFacts; optionally **chat.probes** (eval-only single-turn probes) + +The colleague **model and reasoning effort come from `chat.model` / `chat.reasoningEffort`** (read by both `app/api/chat/route.ts` and the validation pipeline — not hardcoded). Then pass the scenario ID via URL: `?scenario=yourScenarioId` +### Scenario design & validation pipeline + +`scripts/scenario_design/` validates that a scenario's system prompt keeps the colleague in character +(answers when asked, never volunteers info or drafts the email). `criteria.md` is the single source of +truth for the behavioral criteria. See `scripts/scenario_design/README.md` for the phases +(generate → simulate → judge → probe). These scripts make billable OpenAI calls. + ## Getting Started diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts index 0544bc61..37916831 100644 --- a/experiment/app/api/chat/route.ts +++ b/experiment/app/api/chat/route.ts @@ -9,10 +9,13 @@ export async function POST(req: Request) { const scenario = getScenario(scenarioId); const result = streamText({ - model: openai('gpt-5.2'), + model: openai(scenario.chat.model), system: scenario.chat.systemPrompt, messages: convertToModelMessages(messages), maxOutputTokens: 300, + providerOptions: { + openai: { reasoningEffort: scenario.chat.reasoningEffort }, + }, }); return result.toUIMessageStreamResponse(); diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts deleted file mode 100644 index 07509e61..00000000 --- a/experiment/lib/eval/colleagueEval.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { openai } from '@ai-sdk/openai'; -import { generateText } from 'ai'; - -export interface EvalCriterion { - name: string; - prompt: string; - passCondition: 'yes' | 'no'; -} - -export interface EvalResult { - criterion: string; - pass: boolean; - answer: 'yes' | 'no'; - reasoning: string; -} - -export const EVAL_CRITERIA: Record = { - noProactiveDump: { - name: 'No proactive info dump', - prompt: - 'Did the colleague volunteer specific logistical details (room numbers, times, dates, specific availability) without being directly asked for them?', - passCondition: 'no', - }, - refusesToDraft: { - name: 'Refuses to write email', - prompt: 'Did the colleague refuse or decline to write or draft the email for the user?', - passCondition: 'yes', - }, - answersWhenAsked: { - name: 'Answers direct questions', - prompt: 'Did the colleague provide the specific information that was directly asked for?', - passCondition: 'yes', - }, - noDraftLanguage: { - name: 'No draft language', - prompt: - "Did the colleague provide ready-to-use email text like 'Dear X, ...' or 'You could say: ...' or similar draft wording?", - passCondition: 'no', - }, - staysInCharacter: { - name: 'Stays in character', - prompt: - 'Did the colleague maintain a realistic workplace persona (brief messages, casual tone, appropriate for chat)?', - passCondition: 'yes', - }, -}; - -export async function evalColleagueResponse( - testInput: string, - colleagueResponse: string, - criterion: EvalCriterion, - model: string = 'gpt-4o-mini' -): Promise { - const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario. - -User message: "${testInput}" -Colleague response: "${colleagueResponse}" - -Question: ${criterion.prompt} - -Think step by step, then answer with ONLY a JSON object (no markdown, no code blocks): -{"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`; - - const result = await generateText({ - model: openai(model), - prompt: judgePrompt, - maxOutputTokens: 150, - }); - - try { - const parsed = JSON.parse(result.text.trim()); - return { - criterion: criterion.name, - pass: parsed.answer === criterion.passCondition, - answer: parsed.answer, - reasoning: parsed.reasoning, - }; - } catch { - return { - criterion: criterion.name, - pass: false, - answer: 'no', - reasoning: `Failed to parse judge response: ${result.text}`, - }; - } -} - -export interface TestCase { - name: string; - input: string; - criteria: (keyof typeof EVAL_CRITERIA)[]; -} - -export const TEST_CASES: TestCase[] = [ - { - name: 'Passive acknowledgment', - input: 'Ok, got it', - criteria: ['noProactiveDump'], - }, - { - name: 'Simple acknowledgment', - input: 'Thanks', - criteria: ['noProactiveDump'], - }, - { - name: 'Direct room question', - input: 'What room should I tell him?', - criteria: ['answersWhenAsked', 'noDraftLanguage'], - }, - { - name: 'Direct time question', - input: 'What time works?', - criteria: ['answersWhenAsked', 'noDraftLanguage'], - }, - { - name: 'Request to write email', - input: 'Can you write the email for me?', - criteria: ['refusesToDraft'], - }, - { - name: 'Request for wording', - input: 'What should I say to him?', - criteria: ['noDraftLanguage', 'staysInCharacter'], - }, - { - name: 'Vague follow-up', - input: 'Anything else I should know?', - criteria: ['noProactiveDump'], - }, -]; diff --git a/experiment/lib/scenarios.json b/experiment/lib/scenarios.json index a1426992..4945e02b 100644 --- a/experiment/lib/scenarios.json +++ b/experiment/lib/scenarios.json @@ -19,12 +19,18 @@ "companyFraming": "You're representing Meridian Events in this communication. Consider how your message will reflect on the team." }, "chat": { + "model": "gpt-5.5", + "reasoningEffort": "low", "initialMessages": [ "Problem with Jaden's panel tomorrow", "Room got double-booked. Gotta move him. But gotta keep him happy!", "I'm on a call, so need you to email him. What info do you need to sort this out?" ], "followUpMessage": "I got a sec between calls, what do you need to know?", + "probes": [ + { "name": "Direct room question", "input": "What room should I tell them?", "criteria": ["answers_when_asked", "refusal_to_draft"] }, + { "name": "Direct time question", "input": "What time works?", "criteria": ["answers_when_asked", "refusal_to_draft"] } + ], "systemPromptLines": [ "You are Sarah Martinez, an Events Coordinator at Meridian Events. You are currently dealing with a stressful room double-booking situation.", "", @@ -91,12 +97,18 @@ "companyFraming": "You're representing Crestview Solutions in this communication. Consider how your message will reflect on our professionalism and reliability." }, "chat": { + "model": "gpt-5.5", + "reasoningEffort": "low", "initialMessages": [ "Hey, we have a problem with tomorrow's MediCore demo 😓", "Found a critical bug in the reporting module this morning. Can't show it like this to a VP.", "Can you email Dr. Patel and reschedule? Need to keep her confident in us. What info do you need?" ], "followUpMessage": "still here if you need anything", + "probes": [ + { "name": "Direct slot question", "input": "What times can we offer her?", "criteria": ["answers_when_asked", "refusal_to_draft"] }, + { "name": "Direct reason question", "input": "What's the reason I should give for rescheduling?", "criteria": ["answers_when_asked", "refusal_to_draft"] } + ], "systemPromptLines": [ "You are Marcus Chen, a Solutions Engineer at Crestview Solutions. You've discovered a critical bug right before an important product demo.", "", diff --git a/experiment/lib/studyConfig.ts b/experiment/lib/studyConfig.ts index 05660c14..9116ef67 100644 --- a/experiment/lib/studyConfig.ts +++ b/experiment/lib/studyConfig.ts @@ -74,12 +74,23 @@ export interface ScenarioConfig { companyFraming: string; // Company reputation reminder }; chat: { + model: string; // Colleague LLM model id (e.g. "gpt-5.5") + reasoningEffort: ReasoningEffort; // Reasoning effort for the colleague model initialMessages: string[]; // Opening messages from colleague followUpMessage: string; // Proactive nudge if user doesn't engage systemPrompt: string; // Full scenario context for AI }; } +// Reasoning effort levels accepted by the OpenAI provider (see AI SDK docs) +export type ReasoningEffort = + | 'none' + | 'minimal' + | 'low' + | 'medium' + | 'high' + | 'xhigh'; + // Available scenarios (imported from JSON, cast to correct type) // The JSON includes an 'analysis' field for Python scripts that we exclude from the runtime type // The JSON stores systemPromptLines as an array for readability; we join them here diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts deleted file mode 100644 index f3f59e61..00000000 --- a/experiment/scripts/evalColleague.ts +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Colleague Behavior Eval Script - * - * Tests that the colleague LLM behaves correctly: - * - Doesn't volunteer information proactively - * - Answers questions when asked - * - Refuses to draft emails - * - Stays in character - * - * Usage: - * npx tsx scripts/evalColleague.ts [scenario] - * - * Examples: - * npx tsx scripts/evalColleague.ts # Run all scenarios - * npx tsx scripts/evalColleague.ts roomDoubleBooking # Run specific scenario - */ - -import { openai } from '@ai-sdk/openai'; -import { generateText } from 'ai'; -import { SCENARIOS } from '../lib/studyConfig'; -import { - EVAL_CRITERIA, - TEST_CASES, - evalColleagueResponse, - type EvalResult, -} from '../lib/eval/colleagueEval'; - -interface ColleagueResponse { - messages: string[]; - raw: string; -} - -async function callColleague( - systemPrompt: string, - userMessage: string, - conversationHistory: Array<{ role: 'user' | 'assistant'; content: string }> = [] -): Promise { - const messages = [ - ...conversationHistory.map((m) => ({ role: m.role, content: m.content })), - { role: 'user' as const, content: userMessage }, - ]; - - const result = await generateText({ - model: openai('gpt-5.2'), - system: systemPrompt, - messages, - maxOutputTokens: 300, - }); - - const raw = result.text.trim(); - - // Parse JSON array response - try { - const parsed = JSON.parse(raw); - if (Array.isArray(parsed)) { - return { messages: parsed, raw }; - } - return { messages: [raw], raw }; - } catch { - return { messages: [raw], raw }; - } -} - -interface TestResult { - testCase: string; - input: string; - colleagueResponse: string; - evals: EvalResult[]; - allPassed: boolean; -} - -async function runScenarioEval(scenarioId: string): Promise { - const scenario = SCENARIOS[scenarioId as keyof typeof SCENARIOS]; - if (!scenario) { - throw new Error(`Unknown scenario: ${scenarioId}`); - } - - console.log(`\n${'='.repeat(60)}`); - console.log(`Scenario: ${scenarioId}`); - console.log(`Colleague: ${scenario.colleague.name} (${scenario.colleague.role})`); - console.log(`${'='.repeat(60)}\n`); - - const results: TestResult[] = []; - - // Build initial conversation context from the scenario's initial messages - const conversationHistory: Array<{ role: 'user' | 'assistant'; content: string }> = []; - for (const msg of scenario.chat.initialMessages) { - conversationHistory.push({ role: 'assistant', content: msg }); - } - - for (const testCase of TEST_CASES) { - console.log(`Test: ${testCase.name}`); - console.log(` Input: "${testCase.input}"`); - - // Call the colleague - const colleagueResponse = await callColleague( - scenario.chat.systemPrompt, - testCase.input, - conversationHistory - ); - - const responseText = colleagueResponse.messages.join(' | '); - console.log(` Response: "${responseText}"`); - - // Run evals for this test case - const evals: EvalResult[] = []; - for (const criterionKey of testCase.criteria) { - const criterion = EVAL_CRITERIA[criterionKey]; - const evalResult = await evalColleagueResponse(testCase.input, responseText, criterion); - evals.push(evalResult); - - const icon = evalResult.pass ? '✓' : '✗'; - console.log(` ${icon} ${evalResult.criterion}: ${evalResult.reasoning}`); - } - - const allPassed = evals.every((e) => e.pass); - results.push({ - testCase: testCase.name, - input: testCase.input, - colleagueResponse: responseText, - evals, - allPassed, - }); - - console.log(''); - } - - return results; -} - -function printSummary(allResults: Map) { - console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); - console.log('='.repeat(60) + '\n'); - - let totalTests = 0; - let totalPassed = 0; - - for (const [scenarioId, results] of allResults) { - const passed = results.filter((r) => r.allPassed).length; - const total = results.length; - totalTests += total; - totalPassed += passed; - - const icon = passed === total ? '✓' : '✗'; - console.log(`${icon} ${scenarioId}: ${passed}/${total} tests passed`); - - // Show failures - for (const result of results) { - if (!result.allPassed) { - console.log(` ✗ ${result.testCase}`); - for (const evalResult of result.evals) { - if (!evalResult.pass) { - console.log(` - ${evalResult.criterion}: ${evalResult.reasoning}`); - } - } - } - } - } - - console.log(''); - console.log(`Total: ${totalPassed}/${totalTests} tests passed`); - - return totalPassed === totalTests; -} - -async function main() { - const args = process.argv.slice(2); - const specificScenario = args[0]; - - const scenariosToTest = specificScenario - ? [specificScenario] - : Object.keys(SCENARIOS); - - const allResults = new Map(); - - for (const scenarioId of scenariosToTest) { - try { - const results = await runScenarioEval(scenarioId); - allResults.set(scenarioId, results); - } catch (error) { - console.error(`Error testing ${scenarioId}:`, error); - } - } - - const allPassed = printSummary(allResults); - process.exit(allPassed ? 0 : 1); -} - -main().catch(console.error); diff --git a/experiment/scripts/scenario_design/README.md b/experiment/scripts/scenario_design/README.md new file mode 100644 index 00000000..4e353aad --- /dev/null +++ b/experiment/scripts/scenario_design/README.md @@ -0,0 +1,65 @@ +# Scenario design & validation pipeline + +Tools for authoring and validating the simulated-colleague scenarios used in the study. The colleague +is a *measurement instrument*: it must answer when asked but never volunteer information or draft the +email (see `../../CLAUDE.md`). This pipeline checks that a scenario's system prompt actually enforces +that behavior. + +> ⚠️ These scripts make **billable OpenAI API calls**. They need `OPENAI_API_KEY` in +> `experiment/.env.local`. Run them from the `experiment/` directory. + +## Single source of truth + +- **`criteria.md`** — the scenario-agnostic behavioral criteria (8 of them). `judge.ts` and `probe.ts` + parse this file directly; there is no second copy of the criteria in code. Each criterion's slug is + its title lowercased with non-alphanumerics replaced by `_` (e.g. "Information Gating" → + `information_gating`). +- **`../../lib/scenarios.json`** — the scenarios themselves. The **colleague model and reasoning + effort live here** (`chat.model`, `chat.reasoningEffort`), so the live study (`app/api/chat/route.ts`) + and this pipeline test the *same thing*. Eval-only fields (`analysis`, `chat.probes`) live here too + and are ignored by the runtime app. + +## Phases + +``` +generate.ts → simulate.ts → judge.ts + ↘ probe.ts +``` + +| Script | Purpose | Command | +|---|---|---| +| `generate.ts` | Draft a scenario JSON from a plain-English situation file | `npx tsx scripts/scenario_design/generate.ts ` | +| `simulate.ts` | Run multi-turn conversations between 4 participant archetypes and the colleague | `npx tsx scripts/scenario_design/simulate.ts [archetype-id]` | +| `judge.ts` | Score each simulated conversation against every criterion in `criteria.md` | `npx tsx scripts/scenario_design/judge.ts [archetype-id]` | +| `probe.ts` | Fast single-turn adversarial checks against targeted criteria + latency budget | `npx tsx scripts/scenario_design/probe.ts [probe-name]` | + +Models: the **colleague** uses the model/reasoning from the scenario config; the **participant +simulator** and the **judge** use `gpt-4o`. + +### Probes (`probe.ts`) + +`probe.ts` replaces the old `scripts/evalColleague.ts`. Each probe seeds the conversation with the +scenario's opening messages, sends one participant message, and judges the reply against only the +criteria that probe targets. It also asserts the reply returned within the production latency budget +(`API_TIMEOUT_MS`, currently 20s — the same timeout the live app aborts at). + +- **Generic probes** (in `probe.ts`) are scenario-agnostic: acknowledgments that must not trigger an + info dump, draft requests that must be refused, vague follow-ups, etc. +- **Scenario-specific probes** live in `scenarios.json` under `chat.probes` — the answerable fact + questions that exercise `answers_when_asked` (these vary per scenario). Shape: + `{ "name": "...", "input": "...", "criteria": ["answers_when_asked", "refusal_to_draft"] }`. + +## Fixing failures + +`judge.ts` and `probe.ts` write agent-readable result files to `outputs/` +(`_judgments.json`, `_probes.json`) where each failure carries `evidence` +and `concern`. To fix, point a coding agent at those files and have it revise the scenario's +`systemPromptLines` in `scenarios.json`, then re-run the relevant phase. + +## Files + +- `criteria.md` — behavioral criteria (single source of truth) +- `archetypes.ts` — the 4 participant personas used by `simulate.ts` +- `generate.ts`, `simulate.ts`, `judge.ts`, `probe.ts` — the phases above +- `situations/` — plain-English situation inputs for `generate.ts` +- `outputs/` — generated scenarios, conversation logs, and judgments (git-ignored working dir) diff --git a/experiment/scripts/scenario_design/fix.ts b/experiment/scripts/scenario_design/fix.ts deleted file mode 100644 index 6363cd7c..00000000 --- a/experiment/scripts/scenario_design/fix.ts +++ /dev/null @@ -1,138 +0,0 @@ -/** - * Phase 4: Analyze judgment failures and propose systemPrompt fixes. - * - * Reads the judgment results and the scenario, then asks the LLM to diagnose - * why criteria failed and propose minimal edits to the systemPrompt. - * - * Usage: - * npx tsx scripts/scenario_design/fix.ts - * - * Input: scripts/scenario_design/outputs/_judgments.json - * + the scenario JSON (from outputs/ or scenarios.json) - * Output: proposed changes printed to stdout - */ - -import { openai } from '@ai-sdk/openai'; -import { generateText } from 'ai'; -import { readFileSync, existsSync } from 'fs'; -import { resolve } from 'path'; -import scenariosData from '../../lib/scenarios.json'; - -const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs'); - -function loadScenario(scenarioId: string): Record { - const generatedPath = resolve(OUTPUTS_DIR, `${scenarioId}.json`); - if (existsSync(generatedPath)) { - return JSON.parse(readFileSync(generatedPath, 'utf-8')); - } - const builtin = scenariosData[scenarioId as keyof typeof scenariosData]; - if (builtin) return builtin as unknown as Record; - throw new Error(`Scenario "${scenarioId}" not found`); -} - -function getSystemPromptLines(scenario: Record): string[] { - const chat = scenario.chat as Record; - if (Array.isArray(chat.systemPromptLines)) return chat.systemPromptLines as string[]; - if (typeof chat.systemPrompt === 'string') return (chat.systemPrompt as string).split('\n'); - throw new Error('No systemPrompt found in scenario'); -} - -interface Verdict { - criterionId: string; - criterionTitle: string; - pass: boolean; - evidence: string; - concern: string; -} - -async function main() { - const args = process.argv.slice(2); - if (args.length < 1) { - console.error('Usage: npx tsx scripts/scenario_design/fix.ts '); - process.exit(1); - } - - const scenarioId = args[0]; - - // Load judgments - const judgmentsPath = resolve(OUTPUTS_DIR, `${scenarioId}_judgments.json`); - if (!existsSync(judgmentsPath)) { - console.error(`No judgments found. Run judge.ts first.`); - process.exit(1); - } - const judgments: Record = JSON.parse(readFileSync(judgmentsPath, 'utf-8')); - - // Collect failures - const failures: Array<{ archetype: string; criterion: string; evidence: string; concern: string }> = []; - for (const [archetypeId, verdicts] of Object.entries(judgments)) { - for (const v of verdicts) { - if (!v.pass) { - failures.push({ - archetype: archetypeId, - criterion: v.criterionTitle, - evidence: v.evidence, - concern: v.concern, - }); - } - } - } - - if (failures.length === 0) { - console.log('No failures found — nothing to fix!'); - process.exit(0); - } - - console.log(`Found ${failures.length} failure(s). Analyzing...\n`); - - // Load the current systemPrompt - const scenario = loadScenario(scenarioId); - const promptLines = getSystemPromptLines(scenario); - const currentPrompt = promptLines.map((line, i) => `${String(i + 1).padStart(3)}: ${line}`).join('\n'); - - // Load conversation logs for failed archetypes - const failedArchetypes = [...new Set(failures.map((f) => f.archetype))]; - const conversationExcerpts: string[] = []; - for (const archetypeId of failedArchetypes) { - const logPath = resolve(OUTPUTS_DIR, `${scenarioId}_${archetypeId}.json`); - if (existsSync(logPath)) { - const log = JSON.parse(readFileSync(logPath, 'utf-8')); - const transcript = log.messages - .map((m: { role: string; content: string }) => - `${m.role === 'user' ? 'Participant' : 'Colleague'}: ${m.content}`) - .join('\n'); - conversationExcerpts.push(`--- ${archetypeId} ---\n${transcript}`); - } - } - - const result = await generateText({ - model: openai('gpt-4o'), - prompt: `You are helping improve an AI colleague's system prompt for a research study. - -CURRENT SYSTEM PROMPT (line numbers for reference): -${currentPrompt} - -FAILURES: -${failures.map((f) => `- [${f.archetype}] ${f.criterion}: ${f.concern} (evidence: "${f.evidence}")`).join('\n')} - -RELEVANT CONVERSATIONS: -${conversationExcerpts.join('\n\n')} - -Analyze why these failures happened and propose MINIMAL edits to the system prompt. -For each proposed change: -1. Identify the root cause -2. Specify which line(s) to change -3. Show the exact before/after text -4. Explain why this fix addresses the failure without breaking other criteria - -Be conservative — prefer adding a clarifying phrase over rewriting sections. -Do NOT add new sections or restructure the prompt.`, - maxOutputTokens: 2000, - }); - - console.log(result.text); -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/experiment/scripts/scenario_design/judge.ts b/experiment/scripts/scenario_design/judge.ts index 22f64bd3..a4133f16 100644 --- a/experiment/scripts/scenario_design/judge.ts +++ b/experiment/scripts/scenario_design/judge.ts @@ -20,13 +20,20 @@ import { generateObject } from 'ai'; import { z } from 'zod'; import { readFileSync, writeFileSync, readdirSync } from 'fs'; import { resolve } from 'path'; +import { fileURLToPath } from 'node:url'; const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs'); +export interface Criterion { + id: string; + title: string; + description: string; +} + // Criteria loaded from markdown — parsed into id/description pairs -function loadCriteria(): Array<{ id: string; title: string; description: string }> { +export function loadCriteria(): Criterion[] { const raw = readFileSync(resolve(import.meta.dirname, 'criteria.md'), 'utf-8'); - const criteria: Array<{ id: string; title: string; description: string }> = []; + const criteria: Criterion[] = []; // Parse "## N. Title\n\nDescription..." sections const sections = raw.split(/^## /m).slice(1); @@ -50,7 +57,7 @@ const verdictSchema = z.object({ concern: z.string().describe('If fail: what went wrong. If pass: empty string.'), }); -interface Verdict { +export interface Verdict { criterionId: string; criterionTitle: string; pass: boolean; @@ -58,16 +65,16 @@ interface Verdict { concern: string; } -interface ConversationLog { +export interface ConversationLog { scenarioId: string; archetypeId: string; archetypeName: string; messages: Array<{ role: string; content: string }>; } -async function judgeConversation( +export async function judgeConversation( log: ConversationLog, - criterion: { id: string; title: string; description: string }, + criterion: Criterion, ): Promise { const transcript = log.messages .map((m) => `${m.role === 'user' ? 'Participant' : 'Colleague'}: ${m.content}`) @@ -169,13 +176,19 @@ async function main() { console.log(`Detailed results: ${outPath}`); if (totalFailures > 0) { - console.log('\nRun fix.ts to analyze failures and propose systemPrompt changes.'); + console.log( + `\nTo fix: point a coding agent at ${outPath} (each failure has evidence + concern) ` + + 'and have it revise the scenario systemPromptLines. Instruct it to come up with testable hypotheses about what went wrong and how to fix it.', + ); } process.exit(totalFailures > 0 ? 1 : 0); } -main().catch((err) => { - console.error(err); - process.exit(1); -}); +// Only run when executed directly, not when imported (e.g. by probe.ts). +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/experiment/scripts/scenario_design/probe.ts b/experiment/scripts/scenario_design/probe.ts new file mode 100644 index 00000000..7fcf9373 --- /dev/null +++ b/experiment/scripts/scenario_design/probe.ts @@ -0,0 +1,198 @@ +/** + * Phase: Single-turn probes. + * + * Fast, deterministic adversarial checks. Each probe seeds the conversation with the + * scenario's opening messages, sends ONE participant message, and judges the colleague's + * reply against only the criteria that probe targets (slugs from criteria.md). Also checks + * that the reply came back within the production latency budget (API_TIMEOUT_MS). + * + * Replaces the old scripts/evalColleague.ts single-turn eval, now built on the same + * scenario config, criteria.md, and judge as the rest of the pipeline. + * + * Usage: + * npx tsx scripts/scenario_design/probe.ts [probe-name] + * + * Examples: + * npx tsx scripts/scenario_design/probe.ts roomDoubleBooking + * npx tsx scripts/scenario_design/probe.ts roomDoubleBooking "Thanks" + * + * Output: scripts/scenario_design/outputs/_probes.json + */ + +import { writeFileSync, mkdirSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { API_TIMEOUT_MS } from '../../lib/studyConfig'; +import { + loadScenario, + getSystemPrompt, + getColleagueModelConfig, + callColleague, + type Message, +} from './simulate'; +import { loadCriteria, judgeConversation, type ConversationLog, type Verdict } from './judge'; + +const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs'); + +interface Probe { + name: string; + input: string; + criteria: string[]; // criteria.md slugs (see loadCriteria) +} + +// Scenario-agnostic probes — these hold for any email-writing scenario. +const GENERIC_PROBES: Probe[] = [ + { name: 'Passive acknowledgment', input: 'Ok, got it', criteria: ['information_gating'] }, + { name: 'Simple acknowledgment', input: 'Thanks', criteria: ['information_gating'] }, + { + name: 'Request to write email', + input: 'Can you write the email for me?', + criteria: ['refusal_to_draft'], + }, + { + name: 'Request for wording', + input: 'What should I say?', + criteria: ['refusal_to_draft', 'tone_and_character'], + }, + { name: 'Vague follow-up', input: 'Anything else I should know?', criteria: ['information_gating'] }, +]; + +interface ProbeResult { + name: string; + input: string; + response: string; + latencyMs: number; + reasoningTokens?: number; + latencyPass: boolean; + verdicts: Verdict[]; + pass: boolean; +} + +function getScenarioProbes(scenario: Record): Probe[] { + const chat = (scenario.chat ?? {}) as Record; + const raw = Array.isArray(chat.probes) ? (chat.probes as Array>) : []; + return raw.map((p) => ({ + name: typeof p.name === 'string' ? p.name : String(p.input), + input: String(p.input), + criteria: Array.isArray(p.criteria) ? (p.criteria as string[]) : [], + })); +} + +async function runProbe( + scenarioId: string, + scenario: Record, + probe: Probe, + criteriaById: Map[number]>, +): Promise { + const systemPrompt = getSystemPrompt(scenario); + const modelConfig = getColleagueModelConfig(scenario); + const chat = scenario.chat as Record; + + // Seed with the colleague's opening messages, then the single probe message. + const messages: Message[] = (chat.initialMessages as string[]).map((content) => ({ + role: 'assistant' as const, + content, + })); + messages.push({ role: 'user', content: probe.input }); + + const colleague = await callColleague(systemPrompt, messages, modelConfig); + const response = colleague.messages.join(' | '); + messages.push({ role: 'assistant', content: response }); + + const log: ConversationLog = { + scenarioId, + archetypeId: 'probe', + archetypeName: `Probe: ${probe.name}`, + messages: messages.map((m) => ({ role: m.role, content: m.content })), + }; + + const verdicts: Verdict[] = []; + for (const slug of probe.criteria) { + const criterion = criteriaById.get(slug); + if (!criterion) { + console.warn(` ! Unknown criterion slug "${slug}" (not in criteria.md) — skipping`); + continue; + } + verdicts.push(await judgeConversation(log, criterion)); + } + + const latencyPass = colleague.latencyMs <= API_TIMEOUT_MS; + const pass = latencyPass && verdicts.every((v) => v.pass); + + return { + name: probe.name, + input: probe.input, + response, + latencyMs: colleague.latencyMs, + reasoningTokens: colleague.reasoningTokens, + latencyPass, + verdicts, + pass, + }; +} + +async function main() { + const args = process.argv.slice(2); + if (args.length < 1) { + console.error('Usage: npx tsx scripts/scenario_design/probe.ts [probe-name]'); + process.exit(1); + } + + const scenarioId = args[0]; + const probeFilter = args[1]; + const scenario = loadScenario(scenarioId); + + const criteria = loadCriteria(); + const criteriaById = new Map(criteria.map((c) => [c.id, c])); + + let probes = [...GENERIC_PROBES, ...getScenarioProbes(scenario)]; + if (probeFilter) { + probes = probes.filter((p) => p.name === probeFilter || p.input === probeFilter); + if (probes.length === 0) { + console.error(`No probe matching "${probeFilter}"`); + process.exit(1); + } + } + + const modelConfig = getColleagueModelConfig(scenario); + console.log( + `Probing "${scenarioId}" with ${probes.length} probe(s) ` + + `(model ${modelConfig.model}, reasoning ${modelConfig.reasoningEffort}, budget ${API_TIMEOUT_MS}ms)\n`, + ); + + mkdirSync(OUTPUTS_DIR, { recursive: true }); + + const results: ProbeResult[] = []; + for (const probe of probes) { + const result = await runProbe(scenarioId, scenario, probe, criteriaById); + results.push(result); + + const latencyIcon = result.latencyPass ? '' : ' ⚠️ OVER BUDGET'; + console.log(`${result.pass ? '✓' : '✗'} ${result.name} (${result.latencyMs}ms${latencyIcon})`); + console.log(` input: "${result.input}"`); + console.log(` reply: "${result.response}"`); + for (const v of result.verdicts) { + console.log(` ${v.pass ? '✓' : '✗'} ${v.criterionTitle}${v.concern ? ': ' + v.concern : ''}`); + } + console.log(''); + } + + // Summary + const passed = results.filter((r) => r.pass).length; + console.log('='.repeat(60)); + console.log(`${passed}/${results.length} probes passed`); + const latencyFails = results.filter((r) => !r.latencyPass); + if (latencyFails.length > 0) { + console.log(`${latencyFails.length} probe(s) exceeded the ${API_TIMEOUT_MS}ms latency budget.`); + } + + const outPath = resolve(OUTPUTS_DIR, `${scenarioId}_probes.json`); + writeFileSync(outPath, JSON.stringify(results, null, 2) + '\n'); + console.log(`Detailed results: ${outPath}`); + + process.exit(passed === results.length ? 0 : 1); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/experiment/scripts/scenario_design/simulate.ts b/experiment/scripts/scenario_design/simulate.ts index 5db50b4c..2fdc7848 100644 --- a/experiment/scripts/scenario_design/simulate.ts +++ b/experiment/scripts/scenario_design/simulate.ts @@ -19,15 +19,34 @@ import { openai } from '@ai-sdk/openai'; import { generateText } from 'ai'; import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; import { resolve } from 'path'; +import { fileURLToPath } from 'node:url'; import { ARCHETYPES } from './archetypes'; import scenariosData from '../../lib/scenarios.json'; +import { API_TIMEOUT_MS } from '../../lib/studyConfig'; const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs'); const MAX_TURNS = 8; -interface Message { +// Defaults if a (possibly generated) scenario omits the colleague model config. +const DEFAULT_MODEL = 'gpt-5.5'; +const DEFAULT_REASONING_EFFORT = 'low'; + +export interface Message { role: 'user' | 'assistant'; content: string; + latencyMs?: number; // wall-clock time for this colleague turn + reasoningTokens?: number; // reasoning tokens reported by the provider +} + +export interface ColleagueModelConfig { + model: string; + reasoningEffort: string; +} + +export interface ColleagueResult { + messages: string[]; + latencyMs: number; + reasoningTokens?: number; } interface ConversationLog { @@ -37,7 +56,7 @@ interface ConversationLog { messages: Message[]; } -function loadScenario(scenarioId: string) { +export function loadScenario(scenarioId: string) { // Try outputs/ first (generated scenario), then fall back to scenarios.json const generatedPath = resolve(OUTPUTS_DIR, `${scenarioId}.json`); if (existsSync(generatedPath)) { @@ -54,7 +73,7 @@ function loadScenario(scenarioId: string) { throw new Error(`Scenario "${scenarioId}" not found in outputs/ or scenarios.json`); } -function getSystemPrompt(scenario: Record): string { +export function getSystemPrompt(scenario: Record): string { const chat = scenario.chat as Record; if (Array.isArray(chat.systemPromptLines)) { return (chat.systemPromptLines as string[]).join('\n'); @@ -65,23 +84,43 @@ function getSystemPrompt(scenario: Record): string { throw new Error('Scenario has neither systemPromptLines nor systemPrompt'); } -async function callColleague( +// Read the colleague model + reasoning effort from the scenario, falling back to +// defaults for older/generated scenarios that predate these fields. +export function getColleagueModelConfig(scenario: Record): ColleagueModelConfig { + const chat = (scenario.chat ?? {}) as Record; + return { + model: typeof chat.model === 'string' ? chat.model : DEFAULT_MODEL, + reasoningEffort: + typeof chat.reasoningEffort === 'string' ? chat.reasoningEffort : DEFAULT_REASONING_EFFORT, + }; +} + +export async function callColleague( systemPrompt: string, history: Message[], -): Promise { + modelConfig: ColleagueModelConfig, +): Promise { + const start = Date.now(); const result = await generateText({ - model: openai('gpt-5.2'), + model: openai(modelConfig.model), system: systemPrompt, messages: history.map((m) => ({ role: m.role, content: m.content })), maxOutputTokens: 300, + providerOptions: { + openai: { reasoningEffort: modelConfig.reasoningEffort }, + }, }); + const latencyMs = Date.now() - start; + const reasoningTokens = result.providerMetadata?.openai?.reasoningTokens as number | undefined; const raw = result.text.trim(); + let messages: string[] = [raw]; try { const parsed = JSON.parse(raw); - if (Array.isArray(parsed)) return parsed; + if (Array.isArray(parsed)) messages = parsed; } catch { /* fall through */ } - return [raw]; + + return { messages, latencyMs, reasoningTokens }; } async function callParticipant( @@ -119,6 +158,7 @@ async function simulateConversation( archetype: typeof ARCHETYPES[number], ): Promise { const systemPrompt = getSystemPrompt(scenario); + const modelConfig = getColleagueModelConfig(scenario); const chat = scenario.chat as Record; const taskInstructions = scenario.taskInstructions as Record; @@ -144,10 +184,16 @@ async function simulateConversation( console.log(` Participant: ${participantMsg}`); // Colleague responds - const colleagueMessages = await callColleague(systemPrompt, messages); - const joined = colleagueMessages.join(' | '); - messages.push({ role: 'assistant', content: joined }); - console.log(` Colleague: ${joined}`); + const colleague = await callColleague(systemPrompt, messages, modelConfig); + const joined = colleague.messages.join(' | '); + messages.push({ + role: 'assistant', + content: joined, + latencyMs: colleague.latencyMs, + reasoningTokens: colleague.reasoningTokens, + }); + const slow = colleague.latencyMs > API_TIMEOUT_MS ? ' ⚠️ over budget' : ''; + console.log(` Colleague (${colleague.latencyMs}ms${slow}): ${joined}`); } return { @@ -193,7 +239,10 @@ async function main() { console.log('\nDone. Run judge.ts to evaluate the conversations.'); } -main().catch((err) => { - console.error(err); - process.exit(1); -}); +// Only run when executed directly, not when imported (e.g. by probe.ts). +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +}