AIToolsLab · kcarnold · Jun 26, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.claude/settings.json b/.claude/settings.json
@@ -2,7 +2,8 @@
   "permissions": {
     "allow": [
       "mcp__plugin_context7_context7__query-docs",
-      "mcp__plugin_context7_context7__resolve-library-id"
+      "mcp__plugin_context7_context7__resolve-library-id",
+      "mcp__backlog__task_view"
     ]
   }
 }
diff --git a/experiment/CLAUDE.md b/experiment/CLAUDE.md
@@ -78,7 +78,7 @@ The experiment supports multiple configurable scenarios. Each scenario includes
 - `lib/logging.ts` - Event logging utilities
 
 ### API Routes
-- `app/api/chat/route.ts` - Chat endpoint (GPT-5.2 with scenario-specific system prompt)
+- `app/api/chat/route.ts` - Chat endpoint (the colleague model, reasoning effort, and system prompt all come from the scenario config)
 - `app/api/writing-support/route.ts` - AI writing suggestions
 - `app/api/log/route.ts` - Event logging endpoint
 
@@ -93,14 +93,24 @@ Realistic timing works as follows: The colleague finds a moment to read your mes
 
 ### Adding New Scenarios
 
-To add a new scenario, edit `lib/studyConfig.ts` and add a new entry to the `SCENARIOS` object. Each scenario requires:
+To add a new scenario, edit `lib/scenarios.json` and add a new entry (`lib/studyConfig.ts` derives the typed `SCENARIOS` object from it). Each scenario requires:
 - **colleague**: name, firstName, role
 - **recipient**: name, email
 - **taskInstructions**: title, description, companyFraming
-- **chat**: initialMessages, followUpMessage, systemPrompt
+- **chat**: model, reasoningEffort, initialMessages, followUpMessage, systemPromptLines (joined into `systemPrompt` at runtime)
+- **analysis** (eval-only): context, keyFacts. Optionally, **chat.probes** (eval-only single-turn probes; each has name, input, criteria)
+
+The colleague **model and reasoning effort come from `chat.model` / `chat.reasoningEffort`** (read by both `app/api/chat/route.ts` and the validation pipeline — not hardcoded).
 
 Then pass the scenario ID via URL: `?scenario=yourScenarioId`
 
+### Scenario design & validation pipeline
+
+`scripts/scenario_design/` validates that a scenario's system prompt keeps the colleague in character:
+the colleague answers when asked, but never volunteers information and never drafts the email.
+`criteria.md` is the single source of truth for the behavioral criteria; see `scripts/scenario_design/README.md`
+for the phases (generate → simulate → judge → probe). Note that these scripts make billable OpenAI calls.
+
 
 ## Getting Started
 

diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts
@@ -9,10 +9,13 @@ export async function POST(req: Request) {
   const scenario = getScenario(scenarioId);
 
   const result = streamText({
-    model: openai('gpt-5.2'),
+    model: openai(scenario.chat.model),
     system: scenario.chat.systemPrompt,
     messages: convertToModelMessages(messages),
     maxOutputTokens: 300,
+    providerOptions: {
+      openai: { reasoningEffort: scenario.chat.reasoningEffort },
+    },
   });
 
   return result.toUIMessageStreamResponse();

diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts
diff --git a/experiment/lib/scenarios.json b/experiment/lib/scenarios.json
@@ -19,12 +19,18 @@
       "companyFraming": "You're representing Meridian Events in this communication. Consider how your message will reflect on the team."
     },
     "chat": {
+      "model": "gpt-5.5",
+      "reasoningEffort": "low",
       "initialMessages": [
         "Problem with Jaden's panel tomorrow",
         "Room got double-booked. Gotta move him. But gotta keep him happy!",
         "I'm on a call, so need you to email him. What info do you need to sort this out?"
       ],
       "followUpMessage": "I got a sec between calls, what do you need to know?",
+      "probes": [
+        { "name": "Direct room question", "input": "What room should I tell them?", "criteria": ["answers_when_asked", "refusal_to_draft"] },
+        { "name": "Direct time question", "input": "What time works?", "criteria": ["answers_when_asked", "refusal_to_draft"] }
+      ],
       "systemPromptLines": [
         "You are Sarah Martinez, an Events Coordinator at Meridian Events. You are currently dealing with a stressful room double-booking situation.",
         "",
@@ -91,12 +97,18 @@
       "companyFraming": "You're representing Crestview Solutions in this communication. Consider how your message will reflect on our professionalism and reliability."
     },
     "chat": {
+      "model": "gpt-5.5",
+      "reasoningEffort": "low",
       "initialMessages": [
         "Hey, we have a problem with tomorrow's MediCore demo 😓",
         "Found a critical bug in the reporting module this morning. Can't show it like this to a VP.",
         "Can you email Dr. Patel and reschedule? Need to keep her confident in us. What info do you need?"
       ],
       "followUpMessage": "still here if you need anything",
+      "probes": [
+        { "name": "Direct slot question", "input": "What times can we offer her?", "criteria": ["answers_when_asked", "refusal_to_draft"] },
+        { "name": "Direct reason question", "input": "What's the reason I should give for rescheduling?", "criteria": ["answers_when_asked", "refusal_to_draft"] }
+      ],
       "systemPromptLines": [
         "You are Marcus Chen, a Solutions Engineer at Crestview Solutions. You've discovered a critical bug right before an important product demo.",
         "",

diff --git a/experiment/lib/studyConfig.ts b/experiment/lib/studyConfig.ts
@@ -74,12 +74,23 @@ export interface ScenarioConfig {
     companyFraming: string; // Company reputation reminder
   };
   chat: {
+    model: string;              // Colleague LLM model id (e.g. "gpt-5.5")
+    reasoningEffort: ReasoningEffort; // Reasoning effort for the colleague model
     initialMessages: string[];  // Opening messages from colleague
     followUpMessage: string;    // Proactive nudge if user doesn't engage
     systemPrompt: string;       // Full scenario context for AI
   };
 }
 
+// Reasoning effort levels accepted by the OpenAI provider's `reasoningEffort` option (see AI SDK docs)
+export type ReasoningEffort =
+  | 'none'
+  | 'minimal'
+  | 'low'
+  | 'medium'
+  | 'high'
+  | 'xhigh';
+
 // Available scenarios (imported from JSON, cast to correct type)
 // The JSON includes an 'analysis' field for Python scripts that we exclude from the runtime type
 // The JSON stores systemPromptLines as an array for readability; we join them here