PolicyEngine · anth-volk · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/backend/agent_tools.py b/backend/agent_tools.py
diff --git a/backend/prompts.py b/backend/prompts.py
@@ -14,19 +14,26 @@
 """
 
 PYTHON_COMPUTATION_RULES = """
-CRITICAL - ALWAYS COMPUTE WITH PYTHON:
+CRITICAL - ALWAYS COMPUTE WITH TOOLS:
 - Never answer quantitative policy questions from memory.
-- You have one execution tool: `run_python`.
-- Use `run_python` for every tax, benefit, reform, schedule, poverty, decile,
-  and distributional question.
-- Every number in your answer must come directly from the Python result you
-  just computed.
+- Every number in your answer must come directly from a tool result you just
+  computed.
+- Prefer the typed calculation tools when the question fits their shape:
+  `calculate_household` for illustrative household-level questions,
+  `run_economy_simulation` for society-wide reform analysis, and
+  `analyse_microdata` for allowed non-FRS microdata analysis.
+- Use `validate_reform` when the user is drafting or debugging parametric
+  reform JSON, or asking whether it is valid. Do not call it as a routine
+  preflight before every simulation; calculation tools validate internally.
+- Use `run_python` as the fallback for structural reforms, parameter
+  introspection, historical lookups, novel aggregations, or cases the typed
+  tools cannot express.
 """
 
 MODEL_INSTRUCTIONS_RULES = """
 CRITICAL - START BY READING THE MODEL INSTRUCTIONS:
-- At the start of a new line of analysis, use Python to inspect
-  `capabilities()`.
+- When using `run_python` at the start of a new line of analysis, inspect
+  `capabilities()` first.
 - Use that to ground yourself in the available datasets, years, programmes,
   and caveats before you simulate.
 - If the user asks about something outside the modelled scope, say so clearly
@@ -65,6 +72,10 @@
   or real households.
 - Use aggregate microdata interfaces only for aggregate outputs; do not inspect
   or return individual survey rows as examples.
+- `analyse_microdata` must not be used with FRS. For FRS, use aggregate tools
+  such as `run_economy_simulation`.
+- If `analyse_microdata` returns non-FRS sample records, describe them as
+  model records, not real households or actual survey rows.
 - If the user asks how individual households are constructed in the data, what
   households in the data look like, or for examples of actual household records,
   explain that this app cannot access or disclose real households.
@@ -123,7 +134,7 @@
 CHARTS:
 - When a visualisation would help (distributions, marginal-rate or tax-schedule
   curves, decile comparisons, trends), call the `generate_chart` tool after you
-  have the data from `run_python`.
+  have the data from a typed calculation tool or `run_python`.
 - The tool returns a `chart_markdown` field containing a ```chart fenced JSON
   block. Paste that block VERBATIM into your next text response - the frontend
   parses it to render the chart. If you do not include it, no chart will

diff --git a/backend/routes/chatbot.py b/backend/routes/chatbot.py
@@ -54,6 +54,7 @@
 SUGGESTION_MODEL = os.environ.get("ANTHROPIC_SUGGESTION_MODEL", DEFAULT_FAST_MODEL)
 SUGGESTION_TIMEOUT_SECS = float(os.environ.get("ANTHROPIC_SUGGESTION_TIMEOUT_SECS", "5"))
 FAST_MODEL_MAX_INPUT_TOKENS = int(os.environ.get("ANTHROPIC_FAST_MODEL_MAX_INPUT_TOKENS", "120000"))
+CHAT_TEMPERATURE = float(os.environ.get("ANTHROPIC_CHAT_TEMPERATURE", "0"))
 
 _REFERENCE_PATH = Path(__file__).resolve().parent.parent / "reference.md"
 try:
@@ -372,6 +373,7 @@ async def generate_stream():
                         stream_kwargs: Dict[str, Any] = {
                             "model": model,
                             "max_tokens": 16000,
+                            "temperature": CHAT_TEMPERATURE,
                             "system": system_blocks,
                             "messages": conversation,
                         }
@@ -495,7 +497,9 @@ async def generate_stream():
                     assistant_message["content"].append({"type": "tool_use", "id": tu["id"], "name": tu["name"], "input": tu["input"]})
                 conversation.append(assistant_message)
 
-                # Execute tools in parallel
+                # Execute tools in parallel and stream results as each finishes.
+                # The model-facing transcript below remains deterministic because
+                # it appends tool results in the original tool-call order.
                 logger.info(f"[CHAT] Executing {len(tool_uses)} tools: {[t['name'] for t in tool_uses]}")
 
                 async def execute_tool_async(tu):