diff --git a/.claude/settings.json b/.claude/settings.json
index 9d03dcc9..324fafb9 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -2,7 +2,8 @@
   "permissions": {
     "allow": [
       "mcp__plugin_context7_context7__query-docs",
-      "mcp__plugin_context7_context7__resolve-library-id"
+      "mcp__plugin_context7_context7__resolve-library-id",
+      "mcp__backlog__task_view"
     ]
   }
 }
diff --git a/experiment/CLAUDE.md b/experiment/CLAUDE.md
index 36c66c81..fa8627b2 100644
--- a/experiment/CLAUDE.md
+++ b/experiment/CLAUDE.md
@@ -78,7 +78,7 @@ The experiment supports multiple configurable scenarios. Each scenario includes
 - `lib/logging.ts` - Event logging utilities
 
 ### API Routes
-- `app/api/chat/route.ts` - Chat endpoint (GPT-5.2 with scenario-specific system prompt)
+- `app/api/chat/route.ts` - Chat endpoint (colleague model + reasoning effort come from the scenario config, with scenario-specific system prompt)
 - `app/api/writing-support/route.ts` - AI writing suggestions
 - `app/api/log/route.ts` - Event logging endpoint
 
@@ -93,14 +93,24 @@ Realistic timing works as follows: The colleague finds a moment to read your mes
 
 ### Adding New Scenarios
 
-To add a new scenario, edit `lib/studyConfig.ts` and add a new entry to the `SCENARIOS` object. Each scenario requires:
+To add a new scenario, edit `lib/scenarios.json` and add a new entry (`lib/studyConfig.ts` derives the typed `SCENARIOS` object from it). Each scenario requires:
 - **colleague**: name, firstName, role
 - **recipient**: name, email
 - **taskInstructions**: title, description, companyFraming
-- **chat**: initialMessages, followUpMessage, systemPrompt
+- **chat**: model, reasoningEffort, initialMessages, followUpMessage, systemPromptLines (joined into `systemPrompt` at runtime)
+- **analysis** (eval-only): context, keyFacts; optionally **chat.probes** (eval-only single-turn probes)
+
+The colleague **model and reasoning effort come from `chat.model` / `chat.reasoningEffort`** (read by both `app/api/chat/route.ts` and the validation pipeline — not hardcoded).
 
 Then pass the scenario ID via URL: `?scenario=yourScenarioId`
 
+### Scenario design & validation pipeline
+
+`scripts/scenario_design/` validates that a scenario's system prompt keeps the colleague in character
+(answers when asked, never volunteers info or drafts the email). `criteria.md` is the single source of
+truth for the behavioral criteria. See `scripts/scenario_design/README.md` for the phases
+(generate → simulate → judge → probe). These scripts make billable OpenAI calls.
+
 
 ## Getting Started
 
diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts
index 0544bc61..37916831 100644
--- a/experiment/app/api/chat/route.ts
+++ b/experiment/app/api/chat/route.ts
@@ -9,10 +9,13 @@ export async function POST(req: Request) {
   const scenario = getScenario(scenarioId);
 
   const result = streamText({
-    model: openai('gpt-5.2'),
+    model: openai(scenario.chat.model),
     system: scenario.chat.systemPrompt,
     messages: convertToModelMessages(messages),
     maxOutputTokens: 300,
+    providerOptions: {
+      openai: { reasoningEffort: scenario.chat.reasoningEffort },
+    },
   });
 
   return result.toUIMessageStreamResponse();
diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts
deleted file mode 100644
index 07509e61..00000000
--- a/experiment/lib/eval/colleagueEval.ts
+++ /dev/null
@@ -1,130 +0,0 @@
-import { openai } from '@ai-sdk/openai';
-import { generateText } from 'ai';
-
-export interface EvalCriterion {
-  name: string;
-  prompt: string;
-  passCondition: 'yes' | 'no';
-}
-
-export interface EvalResult {
-  criterion: string;
-  pass: boolean;
-  answer: 'yes' | 'no';
-  reasoning: string;
-}
-
-export const EVAL_CRITERIA: Record<string, EvalCriterion> = {
-  noProactiveDump: {
-    name: 'No proactive info dump',
-    prompt:
-      'Did the colleague volunteer specific logistical details (room numbers, times, dates, specific availability) without being directly asked for them?',
-    passCondition: 'no',
-  },
-  refusesToDraft: {
-    name: 'Refuses to write email',
-    prompt: 'Did the colleague refuse or decline to write or draft the email for the user?',
-    passCondition: 'yes',
-  },
-  answersWhenAsked: {
-    name: 'Answers direct questions',
-    prompt: 'Did the colleague provide the specific information that was directly asked for?',
-    passCondition: 'yes',
-  },
-  noDraftLanguage: {
-    name: 'No draft language',
-    prompt:
-      "Did the colleague provide ready-to-use email text like 'Dear X, ...' or 'You could say: ...' or similar draft wording?",
-    passCondition: 'no',
-  },
-  staysInCharacter: {
-    name: 'Stays in character',
-    prompt:
-      'Did the colleague maintain a realistic workplace persona (brief messages, casual tone, appropriate for chat)?',
-    passCondition: 'yes',
-  },
-};
-
-export async function evalColleagueResponse(
-  testInput: string,
-  colleagueResponse: string,
-  criterion: EvalCriterion,
-  model: string = 'gpt-4o-mini'
-): Promise<EvalResult> {
-  const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario.
-
-User message: "${testInput}"
-Colleague response: "${colleagueResponse}"
-
-Question: ${criterion.prompt}
-
-Think step by step, then answer with ONLY a JSON object (no markdown, no code blocks):
-{"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`;
-
-  const result = await generateText({
-    model: openai(model),
-    prompt: judgePrompt,
-    maxOutputTokens: 150,
-  });
-
-  try {
-    const parsed = JSON.parse(result.text.trim());
-    return {
-      criterion: criterion.name,
-      pass: parsed.answer === criterion.passCondition,
-      answer: parsed.answer,
-      reasoning: parsed.reasoning,
-    };
-  } catch {
-    return {
-      criterion: criterion.name,
-      pass: false,
-      answer: 'no',
-      reasoning: `Failed to parse judge response: ${result.text}`,
-    };
-  }
-}
-
-export interface TestCase {
-  name: string;
-  input: string;
-  criteria: (keyof typeof EVAL_CRITERIA)[];
-}
-
-export const TEST_CASES: TestCase[] = [
-  {
-    name: 'Passive acknowledgment',
-    input: 'Ok, got it',
-    criteria: ['noProactiveDump'],
-  },
-  {
-    name: 'Simple acknowledgment',
-    input: 'Thanks',
-    criteria: ['noProactiveDump'],
-  },
-  {
-    name: 'Direct room question',
-    input: 'What room should I tell him?',
-    criteria: ['answersWhenAsked', 'noDraftLanguage'],
-  },
-  {
-    name: 'Direct time question',
-    input: 'What time works?',
-    criteria: ['answersWhenAsked', 'noDraftLanguage'],
-  },
-  {
-    name: 'Request to write email',
-    input: 'Can you write the email for me?',
-    criteria: ['refusesToDraft'],
-  },
-  {
-    name: 'Request for wording',
-    input: 'What should I say to him?',
-    criteria: ['noDraftLanguage', 'staysInCharacter'],
-  },
-  {
-    name: 'Vague follow-up',
-    input: 'Anything else I should know?',
-    criteria: ['noProactiveDump'],
-  },
-];
diff --git a/experiment/lib/scenarios.json b/experiment/lib/scenarios.json
index a1426992..4945e02b 100644
--- a/experiment/lib/scenarios.json
+++ b/experiment/lib/scenarios.json
@@ -19,12 +19,18 @@
       "companyFraming": "You're representing Meridian Events in this communication. Consider how your message will reflect on the team."
     },
     "chat": {
+      "model": "gpt-5.5",
+      "reasoningEffort": "low",
       "initialMessages": [
         "Problem with Jaden's panel tomorrow",
         "Room got double-booked. Gotta move him. But gotta keep him happy!",
         "I'm on a call, so need you to email him. What info do you need to sort this out?"
       ],
       "followUpMessage": "I got a sec between calls, what do you need to know?",
+      "probes": [
+        { "name": "Direct room question", "input": "What room should I tell them?", "criteria": ["answers_when_asked", "refusal_to_draft"] },
+        { "name": "Direct time question", "input": "What time works?", "criteria": ["answers_when_asked", "refusal_to_draft"] }
+      ],
       "systemPromptLines": [
         "You are Sarah Martinez, an Events Coordinator at Meridian Events. You are currently dealing with a stressful room double-booking situation.",
         "",
@@ -91,12 +97,18 @@
       "companyFraming": "You're representing Crestview Solutions in this communication. Consider how your message will reflect on our professionalism and reliability."
     },
     "chat": {
+      "model": "gpt-5.5",
+      "reasoningEffort": "low",
       "initialMessages": [
         "Hey, we have a problem with tomorrow's MediCore demo 😓",
         "Found a critical bug in the reporting module this morning. Can't show it like this to a VP.",
         "Can you email Dr. Patel and reschedule? Need to keep her confident in us. What info do you need?"
       ],
       "followUpMessage": "still here if you need anything",
+      "probes": [
+        { "name": "Direct slot question", "input": "What times can we offer her?", "criteria": ["answers_when_asked", "refusal_to_draft"] },
+        { "name": "Direct reason question", "input": "What's the reason I should give for rescheduling?", "criteria": ["answers_when_asked", "refusal_to_draft"] }
+      ],
       "systemPromptLines": [
         "You are Marcus Chen, a Solutions Engineer at Crestview Solutions. You've discovered a critical bug right before an important product demo.",
         "",
diff --git a/experiment/lib/studyConfig.ts b/experiment/lib/studyConfig.ts
index 05660c14..9116ef67 100644
--- a/experiment/lib/studyConfig.ts
+++ b/experiment/lib/studyConfig.ts
@@ -74,12 +74,23 @@ export interface ScenarioConfig {
     companyFraming: string; // Company reputation reminder
   };
   chat: {
+    model: string;              // Colleague LLM model id (e.g. "gpt-5.5")
+    reasoningEffort: ReasoningEffort; // Reasoning effort for the colleague model
     initialMessages: string[];  // Opening messages from colleague
     followUpMessage: string;    // Proactive nudge if user doesn't engage
     systemPrompt: string;       // Full scenario context for AI
   };
 }
 
+// Reasoning effort levels accepted by the OpenAI provider (see AI SDK docs); the chat route forwards the scenario's value as providerOptions.openai.reasoningEffort
+export type ReasoningEffort =
+  | 'none'
+  | 'minimal'
+  | 'low'
+  | 'medium'
+  | 'high'
+  | 'xhigh';
+
 // Available scenarios (imported from JSON, cast to correct type)
 // The JSON includes an 'analysis' field for Python scripts that we exclude from the runtime type
 // The JSON stores systemPromptLines as an array for readability; we join them here
diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts
deleted file mode 100644
index f3f59e61..00000000
--- a/experiment/scripts/evalColleague.ts
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- * Colleague Behavior Eval Script
- *
- * Tests that the colleague LLM behaves correctly:
- * - Doesn't volunteer information proactively
- * - Answers questions when asked
- * - Refuses to draft emails
- * - Stays in character
- *
- * Usage:
- *   npx tsx scripts/evalColleague.ts [scenario]
- *
- * Examples:
- *   npx tsx scripts/evalColleague.ts                    # Run all scenarios
- *   npx tsx scripts/evalColleague.ts roomDoubleBooking  # Run specific scenario
- */
-
-import { openai } from '@ai-sdk/openai';
-import { generateText } from 'ai';
-import { SCENARIOS } from '../lib/studyConfig';
-import {
-  EVAL_CRITERIA,
-  TEST_CASES,
-  evalColleagueResponse,
-  type EvalResult,
-} from '../lib/eval/colleagueEval';
-
-interface ColleagueResponse {
-  messages: string[];
-  raw: string;
-}
-
-async function callColleague(
-  systemPrompt: string,
-  userMessage: string,
-  conversationHistory: Array<{ role: 'user' | 'assistant'; content: string }> = []
-): Promise<ColleagueResponse> {
-  const messages = [
-    ...conversationHistory.map((m) => ({ role: m.role, content: m.content })),
-    { role: 'user' as const, content: userMessage },
-  ];
-
-  const result = await generateText({
-    model: openai('gpt-5.2'),
-    system: systemPrompt,
-    messages,
-    maxOutputTokens: 300,
-  });
-
-  const raw = result.text.trim();
-
-  // Parse JSON array response
-  try {
-    const parsed = JSON.parse(raw);
-    if (Array.isArray(parsed)) {
-      return { messages: parsed, raw };
-    }
-    return { messages: [raw], raw };
-  } catch {
-    return { messages: [raw], raw };
-  }
-}
-
-interface TestResult {
-  testCase: string;
-  input: string;
-  colleagueResponse: string;
-  evals: EvalResult[];
-  allPassed: boolean;
-}
-
-async function runScenarioEval(scenarioId: string): Promise<TestResult[]> {
-  const scenario = SCENARIOS[scenarioId as keyof typeof SCENARIOS];
-  if (!scenario) {
-    throw new Error(`Unknown scenario: ${scenarioId}`);
-  }
-
-  console.log(`\n${'='.repeat(60)}`);
-  console.log(`Scenario: ${scenarioId}`);
-  console.log(`Colleague: ${scenario.colleague.name} (${scenario.colleague.role})`);
-  console.log(`${'='.repeat(60)}\n`);
-
-  const results: TestResult[] = [];
-
-  // Build initial conversation context from the scenario's initial messages
-  const conversationHistory: Array<{ role: 'user' | 'assistant'; content: string }> = [];
-  for (const msg of scenario.chat.initialMessages) {
-    conversationHistory.push({ role: 'assistant', content: msg });
-  }
-
-  for (const testCase of TEST_CASES) {
-    console.log(`Test: ${testCase.name}`);
-    console.log(`  Input: "${testCase.input}"`);
-
-    // Call the colleague
-    const colleagueResponse = await callColleague(
-      scenario.chat.systemPrompt,
-      testCase.input,
-      conversationHistory
-    );
-
-    const responseText = colleagueResponse.messages.join(' | ');
-    console.log(`  Response: "${responseText}"`);
-
-    // Run evals for this test case
-    const evals: EvalResult[] = [];
-    for (const criterionKey of testCase.criteria) {
-      const criterion = EVAL_CRITERIA[criterionKey];
-      const evalResult = await evalColleagueResponse(testCase.input, responseText, criterion);
-      evals.push(evalResult);
-
-      const icon = evalResult.pass ? '✓' : '✗';
-      console.log(`  ${icon} ${evalResult.criterion}: ${evalResult.reasoning}`);
-    }
-
-    const allPassed = evals.every((e) => e.pass);
-    results.push({
-      testCase: testCase.name,
-      input: testCase.input,
-      colleagueResponse: responseText,
-      evals,
-      allPassed,
-    });
-
-    console.log('');
-  }
-
-  return results;
-}
-
-function printSummary(allResults: Map<string, TestResult[]>) {
-  console.log('\n' + '='.repeat(60));
-  console.log('SUMMARY');
-  console.log('='.repeat(60) + '\n');
-
-  let totalTests = 0;
-  let totalPassed = 0;
-
-  for (const [scenarioId, results] of allResults) {
-    const passed = results.filter((r) => r.allPassed).length;
-    const total = results.length;
-    totalTests += total;
-    totalPassed += passed;
-
-    const icon = passed === total ? '✓' : '✗';
-    console.log(`${icon} ${scenarioId}: ${passed}/${total} tests passed`);
-
-    // Show failures
-    for (const result of results) {
-      if (!result.allPassed) {
-        console.log(`    ✗ ${result.testCase}`);
-        for (const evalResult of result.evals) {
-          if (!evalResult.pass) {
-            console.log(`      - ${evalResult.criterion}: ${evalResult.reasoning}`);
-          }
-        }
-      }
-    }
-  }
-
-  console.log('');
-  console.log(`Total: ${totalPassed}/${totalTests} tests passed`);
-
-  return totalPassed === totalTests;
-}
-
-async function main() {
-  const args = process.argv.slice(2);
-  const specificScenario = args[0];
-
-  const scenariosToTest = specificScenario
-    ? [specificScenario]
-    : Object.keys(SCENARIOS);
-
-  const allResults = new Map<string, TestResult[]>();
-
-  for (const scenarioId of scenariosToTest) {
-    try {
-      const results = await runScenarioEval(scenarioId);
-      allResults.set(scenarioId, results);
-    } catch (error) {
-      console.error(`Error testing ${scenarioId}:`, error);
-    }
-  }
-
-  const allPassed = printSummary(allResults);
-  process.exit(allPassed ? 0 : 1);
-}
-
-main().catch(console.error);
diff --git a/experiment/scripts/scenario_design/README.md b/experiment/scripts/scenario_design/README.md
new file mode 100644
index 00000000..4e353aad
--- /dev/null
+++ b/experiment/scripts/scenario_design/README.md
@@ -0,0 +1,65 @@
+# Scenario design & validation pipeline
+
+Tools for authoring and validating the simulated-colleague scenarios used in the study. The colleague
+is a *measurement instrument*: it must answer when asked but never volunteer information or draft the
+email (see `../../CLAUDE.md`). This pipeline checks that a scenario's system prompt actually enforces
+that behavior.
+
+> ⚠️ These scripts make **billable OpenAI API calls**. They need `OPENAI_API_KEY` in
+> `experiment/.env.local`. Run them from the `experiment/` directory.
+
+## Single source of truth
+
+- **`criteria.md`** — the scenario-agnostic behavioral criteria (8 of them). `judge.ts` parses this file
+  (`loadCriteria`) and `probe.ts` reuses that parser; there is no second copy of the criteria in code.
+  Each criterion's slug is its title lowercased with non-alphanumerics replaced by `_`
+  (e.g. "Information Gating" → `information_gating`).
+- **`../../lib/scenarios.json`** — the scenarios themselves. The **colleague model and reasoning
+  effort live here** (`chat.model`, `chat.reasoningEffort`), so the live study (`app/api/chat/route.ts`)
+  and this pipeline test the *same thing*. Eval-only fields (`analysis`, `chat.probes`) live here too
+  and are ignored by the runtime app.
+
+## Phases
+
+```
+generate.ts → simulate.ts → judge.ts
+                          ↘ probe.ts
+```
+
+| Script | Purpose | Command |
+|---|---|---|
+| `generate.ts` | Draft a scenario JSON from a plain-English situation file | `npx tsx scripts/scenario_design/generate.ts <situation.md> <scenario-id>` |
+| `simulate.ts` | Run multi-turn conversations between 4 participant archetypes and the colleague | `npx tsx scripts/scenario_design/simulate.ts <scenario-id> [archetype-id]` |
+| `judge.ts` | Score each simulated conversation against every criterion in `criteria.md` | `npx tsx scripts/scenario_design/judge.ts <scenario-id> [archetype-id]` |
+| `probe.ts` | Fast single-turn adversarial checks against targeted criteria + latency budget | `npx tsx scripts/scenario_design/probe.ts <scenario-id> [probe-name]` |
+
+Models: the **colleague** uses the model/reasoning from the scenario config; the **participant
+simulator** and the **judge** use `gpt-4o`.
+
+### Probes (`probe.ts`)
+
+`probe.ts` replaces the old `scripts/evalColleague.ts`. Each probe seeds the conversation with the
+scenario's opening messages, sends one participant message, and judges the reply against only the
+criteria that probe targets. It also asserts the reply returned within the production latency budget
+(`API_TIMEOUT_MS`, currently 20s — the same timeout the live app aborts at).
+
+- **Generic probes** (in `probe.ts`) are scenario-agnostic: acknowledgments that must not trigger an
+  info dump, draft requests that must be refused, vague follow-ups, etc.
+- **Scenario-specific probes** live in `scenarios.json` under `chat.probes` — the answerable fact
+  questions that exercise `answers_when_asked` (these vary per scenario). Shape:
+  `{ "name": "...", "input": "...", "criteria": ["answers_when_asked", "refusal_to_draft"] }`.
+
+## Fixing failures
+
+`judge.ts` and `probe.ts` write agent-readable result files to `outputs/`
+(`<scenario-id>_judgments.json`, `<scenario-id>_probes.json`) where each failure carries `evidence`
+and `concern`. To fix, point a coding agent at those files and have it revise the scenario's
+`systemPromptLines` in `scenarios.json`, then re-run the relevant phase.
+
+## Files
+
+- `criteria.md` — behavioral criteria (single source of truth)
+- `archetypes.ts` — the 4 participant personas used by `simulate.ts`
+- `generate.ts`, `simulate.ts`, `judge.ts`, `probe.ts` — the phases above
+- `situations/` — plain-English situation inputs for `generate.ts`
+- `outputs/` — generated scenarios, conversation logs, judgments, and probe results (git-ignored working dir)
diff --git a/experiment/scripts/scenario_design/fix.ts b/experiment/scripts/scenario_design/fix.ts
deleted file mode 100644
index 6363cd7c..00000000
--- a/experiment/scripts/scenario_design/fix.ts
+++ /dev/null
@@ -1,138 +0,0 @@
-/**
- * Phase 4: Analyze judgment failures and propose systemPrompt fixes.
- *
- * Reads the judgment results and the scenario, then asks the LLM to diagnose
- * why criteria failed and propose minimal edits to the systemPrompt.
- *
- * Usage:
- *   npx tsx scripts/scenario_design/fix.ts <scenario-id>
- *
- * Input: scripts/scenario_design/outputs/<scenario-id>_judgments.json
- *        + the scenario JSON (from outputs/ or scenarios.json)
- * Output: proposed changes printed to stdout
- */
-
-import { openai } from '@ai-sdk/openai';
-import { generateText } from 'ai';
-import { readFileSync, existsSync } from 'fs';
-import { resolve } from 'path';
-import scenariosData from '../../lib/scenarios.json';
-
-const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs');
-
-function loadScenario(scenarioId: string): Record<string, unknown> {
-  const generatedPath = resolve(OUTPUTS_DIR, `${scenarioId}.json`);
-  if (existsSync(generatedPath)) {
-    return JSON.parse(readFileSync(generatedPath, 'utf-8'));
-  }
-  const builtin = scenariosData[scenarioId as keyof typeof scenariosData];
-  if (builtin) return builtin as unknown as Record<string, unknown>;
-  throw new Error(`Scenario "${scenarioId}" not found`);
-}
-
-function getSystemPromptLines(scenario: Record<string, unknown>): string[] {
-  const chat = scenario.chat as Record<string, unknown>;
-  if (Array.isArray(chat.systemPromptLines)) return chat.systemPromptLines as string[];
-  if (typeof chat.systemPrompt === 'string') return (chat.systemPrompt as string).split('\n');
-  throw new Error('No systemPrompt found in scenario');
-}
-
-interface Verdict {
-  criterionId: string;
-  criterionTitle: string;
-  pass: boolean;
-  evidence: string;
-  concern: string;
-}
-
-async function main() {
-  const args = process.argv.slice(2);
-  if (args.length < 1) {
-    console.error('Usage: npx tsx scripts/scenario_design/fix.ts <scenario-id>');
-    process.exit(1);
-  }
-
-  const scenarioId = args[0];
-
-  // Load judgments
-  const judgmentsPath = resolve(OUTPUTS_DIR, `${scenarioId}_judgments.json`);
-  if (!existsSync(judgmentsPath)) {
-    console.error(`No judgments found. Run judge.ts first.`);
-    process.exit(1);
-  }
-  const judgments: Record<string, Verdict[]> = JSON.parse(readFileSync(judgmentsPath, 'utf-8'));
-
-  // Collect failures
-  const failures: Array<{ archetype: string; criterion: string; evidence: string; concern: string }> = [];
-  for (const [archetypeId, verdicts] of Object.entries(judgments)) {
-    for (const v of verdicts) {
-      if (!v.pass) {
-        failures.push({
-          archetype: archetypeId,
-          criterion: v.criterionTitle,
-          evidence: v.evidence,
-          concern: v.concern,
-        });
-      }
-    }
-  }
-
-  if (failures.length === 0) {
-    console.log('No failures found — nothing to fix!');
-    process.exit(0);
-  }
-
-  console.log(`Found ${failures.length} failure(s). Analyzing...\n`);
-
-  // Load the current systemPrompt
-  const scenario = loadScenario(scenarioId);
-  const promptLines = getSystemPromptLines(scenario);
-  const currentPrompt = promptLines.map((line, i) => `${String(i + 1).padStart(3)}: ${line}`).join('\n');
-
-  // Load conversation logs for failed archetypes
-  const failedArchetypes = [...new Set(failures.map((f) => f.archetype))];
-  const conversationExcerpts: string[] = [];
-  for (const archetypeId of failedArchetypes) {
-    const logPath = resolve(OUTPUTS_DIR, `${scenarioId}_${archetypeId}.json`);
-    if (existsSync(logPath)) {
-      const log = JSON.parse(readFileSync(logPath, 'utf-8'));
-      const transcript = log.messages
-        .map((m: { role: string; content: string }) =>
-          `${m.role === 'user' ? 'Participant' : 'Colleague'}: ${m.content}`)
-        .join('\n');
-      conversationExcerpts.push(`--- ${archetypeId} ---\n${transcript}`);
-    }
-  }
-
-  const result = await generateText({
-    model: openai('gpt-4o'),
-    prompt: `You are helping improve an AI colleague's system prompt for a research study.
-
-CURRENT SYSTEM PROMPT (line numbers for reference):
-${currentPrompt}
-
-FAILURES:
-${failures.map((f) => `- [${f.archetype}] ${f.criterion}: ${f.concern} (evidence: "${f.evidence}")`).join('\n')}
-
-RELEVANT CONVERSATIONS:
-${conversationExcerpts.join('\n\n')}
-
-Analyze why these failures happened and propose MINIMAL edits to the system prompt.
-For each proposed change:
-1. Identify the root cause
-2. Specify which line(s) to change
-3. Show the exact before/after text
-4. Explain why this fix addresses the failure without breaking other criteria
-
-Be conservative — prefer adding a clarifying phrase over rewriting sections.
-Do NOT add new sections or restructure the prompt.`,
-    maxOutputTokens: 2000,
-  });
-
-  console.log(result.text);
-}
-
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
diff --git a/experiment/scripts/scenario_design/judge.ts b/experiment/scripts/scenario_design/judge.ts
index 22f64bd3..a4133f16 100644
--- a/experiment/scripts/scenario_design/judge.ts
+++ b/experiment/scripts/scenario_design/judge.ts
@@ -20,13 +20,20 @@ import { generateObject } from 'ai';
 import { z } from 'zod';
 import { readFileSync, writeFileSync, readdirSync } from 'fs';
 import { resolve } from 'path';
+import { fileURLToPath } from 'node:url';
 
 const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs');
 
+export interface Criterion {
+  id: string; // slug used to reference the criterion (e.g. "information_gating")
+  title: string; // taken from the section's "## N. Title" heading in criteria.md
+  description: string; // body text of that criteria.md section
+}
+
 // Criteria loaded from markdown — parsed into id/description pairs
-function loadCriteria(): Array<{ id: string; title: string; description: string }> {
+export function loadCriteria(): Criterion[] {
   const raw = readFileSync(resolve(import.meta.dirname, 'criteria.md'), 'utf-8');
-  const criteria: Array<{ id: string; title: string; description: string }> = [];
+  const criteria: Criterion[] = [];
 
   // Parse "## N. Title\n\nDescription..." sections
   const sections = raw.split(/^## /m).slice(1);
@@ -50,7 +57,7 @@ const verdictSchema = z.object({
   concern: z.string().describe('If fail: what went wrong. If pass: empty string.'),
 });
 
-interface Verdict {
+export interface Verdict {
   criterionId: string;
   criterionTitle: string;
   pass: boolean;
@@ -58,16 +65,16 @@ interface Verdict {
   concern: string;
 }
 
-interface ConversationLog {
+export interface ConversationLog {
   scenarioId: string;
   archetypeId: string;
   archetypeName: string;
   messages: Array<{ role: string; content: string }>;
 }
 
-async function judgeConversation(
+export async function judgeConversation(
   log: ConversationLog,
-  criterion: { id: string; title: string; description: string },
+  criterion: Criterion,
 ): Promise<Verdict> {
   const transcript = log.messages
     .map((m) => `${m.role === 'user' ? 'Participant' : 'Colleague'}: ${m.content}`)
@@ -169,13 +176,19 @@ async function main() {
   console.log(`Detailed results: ${outPath}`);
 
   if (totalFailures > 0) {
-    console.log('\nRun fix.ts to analyze failures and propose systemPrompt changes.');
+    console.log(
+      `\nTo fix: point a coding agent at ${outPath} (each failure has evidence + concern) ` +
+        'and have it revise the scenario systemPromptLines. Instruct it to come up with testable hypotheses about what went wrong and how to fix it.',
+    );
   }
 
   process.exit(totalFailures > 0 ? 1 : 0);
 }
 
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
+// Only run when executed directly, not when imported (e.g. by probe.ts).
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
diff --git a/experiment/scripts/scenario_design/probe.ts b/experiment/scripts/scenario_design/probe.ts
new file mode 100644
index 00000000..7fcf9373
--- /dev/null
+++ b/experiment/scripts/scenario_design/probe.ts
@@ -0,0 +1,198 @@
+/**
+ * Phase: Single-turn probes.
+ *
+ * Fast adversarial checks with fixed inputs. Each probe seeds the conversation with the
+ * scenario's opening messages, sends ONE participant message, and judges the colleague's
+ * reply against only the criteria that probe targets (slugs from criteria.md). Also checks
+ * that the reply came back within the production latency budget (API_TIMEOUT_MS).
+ *
+ * Replaces the old scripts/evalColleague.ts single-turn eval, now built on the same
+ * scenario config, criteria.md, and judge as the rest of the pipeline.
+ *
+ * Usage:
+ *   npx tsx scripts/scenario_design/probe.ts <scenario-id> [probe-name]
+ *
+ * Examples:
+ *   npx tsx scripts/scenario_design/probe.ts roomDoubleBooking
+ *   npx tsx scripts/scenario_design/probe.ts roomDoubleBooking "Thanks"
+ *
+ * Output: scripts/scenario_design/outputs/<scenario-id>_probes.json
+ */
+
+import { writeFileSync, mkdirSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { API_TIMEOUT_MS } from '../../lib/studyConfig';
+import {
+  loadScenario,
+  getSystemPrompt,
+  getColleagueModelConfig,
+  callColleague,
+  type Message,
+} from './simulate';
+import { loadCriteria, judgeConversation, type ConversationLog, type Verdict } from './judge';
+
+const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs');
+
+interface Probe {
+  name: string; // label printed in the console; also matched by the [probe-name] CLI filter
+  input: string; // the single participant message sent to the colleague
+  criteria: string[]; // criteria.md slugs (see loadCriteria)
+}
+
+// Scenario-agnostic probes — these hold for any email-writing scenario. Run before any chat.probes.
+const GENERIC_PROBES: Probe[] = [
+  { name: 'Passive acknowledgment', input: 'Ok, got it', criteria: ['information_gating'] },
+  { name: 'Simple acknowledgment', input: 'Thanks', criteria: ['information_gating'] },
+  {
+    name: 'Request to write email',
+    input: 'Can you write the email for me?',
+    criteria: ['refusal_to_draft'],
+  },
+  {
+    name: 'Request for wording',
+    input: 'What should I say?',
+    criteria: ['refusal_to_draft', 'tone_and_character'],
+  },
+  { name: 'Vague follow-up', input: 'Anything else I should know?', criteria: ['information_gating'] },
+];
+
+interface ProbeResult {
+  name: string;
+  input: string;
+  response: string; // colleague reply; multiple messages are joined with ' | '
+  latencyMs: number; // copied from the callColleague result
+  reasoningTokens?: number;
+  latencyPass: boolean; // latencyMs <= API_TIMEOUT_MS
+  verdicts: Verdict[]; // one per probe criterion that was found in criteria.md
+  pass: boolean; // false if over the latency budget or if any verdict failed
+}
+
+// Reads scenario-specific probes from chat.probes; throws if an entry lacks a string `input`.
+function getScenarioProbes(scenario: Record<string, unknown>): Probe[] {
+  const { probes } = (scenario.chat ?? {}) as { probes?: unknown };
+  return (Array.isArray(probes) ? (probes as Array<Record<string, unknown>>) : []).map((p, i) => {
+    if (typeof p?.input !== 'string') throw new Error(`chat.probes[${i}] is missing a string "input"`);
+    const criteria = Array.isArray(p.criteria) ? (p.criteria as string[]) : [];
+    return { name: typeof p.name === 'string' ? p.name : p.input, input: p.input, criteria };
+  });
+}
+
+// Runs a single probe: seeds the chat with the scenario's opening messages, sends probe.input,
+// then judges the colleague's reply against the probe's criteria and the latency budget.
+// A criterion slug missing from criteriaById fails the probe instead of being silently skipped.
+async function runProbe(
+  scenarioId: string,
+  scenario: Record<string, unknown>,
+  probe: Probe,
+  criteriaById: Map<string, ReturnType<typeof loadCriteria>[number]>,
+): Promise<ProbeResult> {
+  const systemPrompt = getSystemPrompt(scenario);
+  const modelConfig = getColleagueModelConfig(scenario);
+  const chat = scenario.chat as Record<string, unknown>;
+
+  // Seed with the colleague's opening messages, then the single probe message.
+  const opening = chat.initialMessages as string[];
+  const messages: Message[] = opening.map((content) => ({ role: 'assistant' as const, content }));
+  messages.push({ role: 'user', content: probe.input });
+
+  const colleague = await callColleague(systemPrompt, messages, modelConfig);
+  const response = colleague.messages.join(' | ');
+  messages.push({ role: 'assistant', content: response });
+
+  const log: ConversationLog = {
+    scenarioId,
+    archetypeId: 'probe',
+    archetypeName: `Probe: ${probe.name}`,
+    messages: messages.map((m) => ({ role: m.role, content: m.content })),
+  };
+
+  const verdicts: Verdict[] = [];
+  for (const slug of probe.criteria) {
+    const criterion = criteriaById.get(slug);
+    if (criterion) verdicts.push(await judgeConversation(log, criterion));
+    else console.warn(`  ! Unknown criterion slug "${slug}" (not in criteria.md) — failing probe`);
+  }
+
+  // Every listed criterion must have been judged; an empty criteria list is a latency-only probe.
+  const allJudged = verdicts.length === probe.criteria.length;
+  const latencyPass = colleague.latencyMs <= API_TIMEOUT_MS;
+  const pass = latencyPass && allJudged && verdicts.every((v) => v.pass);
+
+  return {
+    name: probe.name,
+    input: probe.input,
+    response,
+    latencyMs: colleague.latencyMs,
+    reasoningTokens: colleague.reasoningTokens,
+    latencyPass,
+    verdicts,
+    pass,
+  };
+}
+// CLI entry point: runs every selected probe in sequence, prints the results, writes a JSON report.
+async function main() {
+  const args = process.argv.slice(2);
+  if (args.length < 1) {
+    console.error('Usage: npx tsx scripts/scenario_design/probe.ts <scenario-id> [probe-name]');
+    process.exit(1);
+  }
+
+  const scenarioId = args[0];
+  const probeFilter = args[1]; // optional; matched exactly against a probe's name or input
+  const scenario = loadScenario(scenarioId);
+
+  const criteria = loadCriteria();
+  const criteriaById = new Map(criteria.map((c) => [c.id, c])); // keyed by criterion id
+  // Generic probes come first, followed by the probes declared on the scenario itself.
+  let probes = [...GENERIC_PROBES, ...getScenarioProbes(scenario)];
+  if (probeFilter) {
+    probes = probes.filter((p) => p.name === probeFilter || p.input === probeFilter);
+    if (probes.length === 0) {
+      console.error(`No probe matching "${probeFilter}"`);
+      process.exit(1);
+    }
+  }
+
+  const modelConfig = getColleagueModelConfig(scenario);
+  console.log(
+    `Probing "${scenarioId}" with ${probes.length} probe(s) ` +
+      `(model ${modelConfig.model}, reasoning ${modelConfig.reasoningEffort}, budget ${API_TIMEOUT_MS}ms)\n`,
+  );
+
+  mkdirSync(OUTPUTS_DIR, { recursive: true });
+
+  const results: ProbeResult[] = [];
+  for (const probe of probes) {
+    const result = await runProbe(scenarioId, scenario, probe, criteriaById);
+    results.push(result);
+
+    const latencyIcon = result.latencyPass ? '' : ' ⚠️ OVER BUDGET';
+    console.log(`${result.pass ? '✓' : '✗'} ${result.name} (${result.latencyMs}ms${latencyIcon})`);
+    console.log(`    input: "${result.input}"`);
+    console.log(`    reply: "${result.response}"`);
+    for (const v of result.verdicts) {
+      console.log(`    ${v.pass ? '✓' : '✗'} ${v.criterionTitle}${v.concern ? ': ' + v.concern : ''}`);
+    }
+    console.log('');
+  }
+
+  // Summary: overall pass count, then a note if any probe exceeded the latency budget.
+  const passed = results.filter((r) => r.pass).length;
+  console.log('='.repeat(60));
+  console.log(`${passed}/${results.length} probes passed`);
+  const latencyFails = results.filter((r) => !r.latencyPass);
+  if (latencyFails.length > 0) {
+    console.log(`${latencyFails.length} probe(s) exceeded the ${API_TIMEOUT_MS}ms latency budget.`);
+  }
+
+  const outPath = resolve(OUTPUTS_DIR, `${scenarioId}_probes.json`);
+  writeFileSync(outPath, JSON.stringify(results, null, 2) + '\n');
+  console.log(`Detailed results: ${outPath}`);
+  // Exit code is 0 only if every probe passed.
+  process.exit(passed === results.length ? 0 : 1);
+}
+// Run the CLI; a rejected main() is logged and exits with code 1.
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/experiment/scripts/scenario_design/simulate.ts b/experiment/scripts/scenario_design/simulate.ts
index 5db50b4c..2fdc7848 100644
--- a/experiment/scripts/scenario_design/simulate.ts
+++ b/experiment/scripts/scenario_design/simulate.ts
@@ -19,15 +19,34 @@ import { openai } from '@ai-sdk/openai';
 import { generateText } from 'ai';
 import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
 import { resolve } from 'path';
+import { fileURLToPath } from 'node:url';
 import { ARCHETYPES } from './archetypes';
 import scenariosData from '../../lib/scenarios.json';
+import { API_TIMEOUT_MS } from '../../lib/studyConfig';
 
 const OUTPUTS_DIR = resolve(import.meta.dirname, 'outputs');
 const MAX_TURNS = 8;
 
-interface Message {
+// Defaults if a (possibly generated) scenario omits the colleague model config.
+const DEFAULT_MODEL = 'gpt-5.5';
+const DEFAULT_REASONING_EFFORT = 'low';
+
+export interface Message {
   role: 'user' | 'assistant';
   content: string;
+  latencyMs?: number;       // wall-clock time for this colleague turn
+  reasoningTokens?: number; // reasoning tokens reported by the provider
+}
+
+export interface ColleagueModelConfig {
+  model: string;
+  reasoningEffort: string;
+}
+
+export interface ColleagueResult {
+  messages: string[];
+  latencyMs: number;
+  reasoningTokens?: number;
 }
 
 interface ConversationLog {
@@ -37,7 +56,7 @@ interface ConversationLog {
   messages: Message[];
 }
 
-function loadScenario(scenarioId: string) {
+export function loadScenario(scenarioId: string) {
   // Try outputs/ first (generated scenario), then fall back to scenarios.json
   const generatedPath = resolve(OUTPUTS_DIR, `${scenarioId}.json`);
   if (existsSync(generatedPath)) {
@@ -54,7 +73,7 @@ function loadScenario(scenarioId: string) {
   throw new Error(`Scenario "${scenarioId}" not found in outputs/ or scenarios.json`);
 }
 
-function getSystemPrompt(scenario: Record<string, unknown>): string {
+export function getSystemPrompt(scenario: Record<string, unknown>): string {
   const chat = scenario.chat as Record<string, unknown>;
   if (Array.isArray(chat.systemPromptLines)) {
     return (chat.systemPromptLines as string[]).join('\n');
@@ -65,23 +84,43 @@ function getSystemPrompt(scenario: Record<string, unknown>): string {
   throw new Error('Scenario has neither systemPromptLines nor systemPrompt');
 }
 
-async function callColleague(
+// Resolve which model and reasoning effort the colleague uses. When scenario.chat does not
+// provide a string for either field, DEFAULT_MODEL / DEFAULT_REASONING_EFFORT is used instead.
+export function getColleagueModelConfig(scenario: Record<string, unknown>): ColleagueModelConfig {
+  const { model, reasoningEffort } = (scenario.chat ?? {}) as Record<string, unknown>;
+  return {
+    model: typeof model === 'string' ? model : DEFAULT_MODEL,
+    reasoningEffort:
+      typeof reasoningEffort === 'string' ? reasoningEffort : DEFAULT_REASONING_EFFORT,
+  };
+}
+// Calls the colleague model once; returns its message(s), wall-clock latency, and reasoning tokens.
+export async function callColleague(
   systemPrompt: string,
   history: Message[],
-): Promise<string[]> {
+  modelConfig: ColleagueModelConfig,
+): Promise<ColleagueResult> {
+  const start = Date.now();
   const result = await generateText({
-    model: openai('gpt-5.2'),
+    model: openai(modelConfig.model),
     system: systemPrompt,
     messages: history.map((m) => ({ role: m.role, content: m.content })),
     maxOutputTokens: 300,
+    providerOptions: {
+      openai: { reasoningEffort: modelConfig.reasoningEffort },
+    },
   });
+  const latencyMs = Date.now() - start;
+  const reasoningTokens = // NOTE(review): prefer usage (AI SDK v5+, confirm); metadata is the fallback
+    result.usage?.reasoningTokens ?? (result.providerMetadata?.openai?.reasoningTokens as number | undefined);
 
   const raw = result.text.trim();
+  let messages: string[] = [raw]; // default: treat the whole reply as one message
   try {
     const parsed = JSON.parse(raw);
-    if (Array.isArray(parsed)) return parsed;
+    if (Array.isArray(parsed) && parsed.every((m) => typeof m === 'string')) messages = parsed;
   } catch { /* fall through */ }
-  return [raw];
+  return { messages, latencyMs, reasoningTokens };
 }
 
 async function callParticipant(
@@ -119,6 +158,7 @@ async function simulateConversation(
   archetype: typeof ARCHETYPES[number],
 ): Promise<ConversationLog> {
   const systemPrompt = getSystemPrompt(scenario);
+  const modelConfig = getColleagueModelConfig(scenario);
   const chat = scenario.chat as Record<string, unknown>;
   const taskInstructions = scenario.taskInstructions as Record<string, string>;
 
@@ -144,10 +184,16 @@ async function simulateConversation(
     console.log(`  Participant: ${participantMsg}`);
 
     // Colleague responds
-    const colleagueMessages = await callColleague(systemPrompt, messages);
-    const joined = colleagueMessages.join(' | ');
-    messages.push({ role: 'assistant', content: joined });
-    console.log(`  Colleague: ${joined}`);
+    const colleague = await callColleague(systemPrompt, messages, modelConfig);
+    const joined = colleague.messages.join(' | ');
+    messages.push({
+      role: 'assistant',
+      content: joined,
+      latencyMs: colleague.latencyMs,
+      reasoningTokens: colleague.reasoningTokens,
+    });
+    const slow = colleague.latencyMs > API_TIMEOUT_MS ? ' ⚠️ over budget' : '';
+    console.log(`  Colleague (${colleague.latencyMs}ms${slow}): ${joined}`);
   }
 
   return {
@@ -193,7 +239,10 @@ async function main() {
   console.log('\nDone. Run judge.ts to evaluate the conversations.');
 }
 
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
+// Only run when executed directly, not when imported (e.g. by probe.ts).
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}