From 9fd7354182ed506d02f806ed1ac9e2dfe81ebf46 Mon Sep 17 00:00:00 2001
From: SakshiKekre <sakshi.kekre@gmail.com>
Date: Wed, 13 May 2026 00:20:16 -0700
Subject: [PATCH] Add model backend selector and Claude Agent SDK runner

Backends: uk_compiled (Rust) and uk_python (Python), selected per
request or via POLICYENGINE_CHAT_BACKEND. Each controls its own prompt
context, tool description, and execution globals. ChatPage exposes a
toggle persisted in localStorage. /chat/backends lists what's available.

Rebased onto main: keeps main's structural plan-mode enforcement and
cached-reference cache breakpoints; backend prompt context is injected
into the cached first block via a template placeholder.

Claude Agent SDK runner is opt-in
(POLICYENGINE_CHAT_AGENT_RUNNER=claude_sdk), mirrors the existing SSE
event contract, exposes run_python via an in-process MCP tool.

Also: modal_app installs from backend/requirements.txt, pins
policyengine_uk==2.88.0, adds GA tracking.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/agent_tools.py             | 166 ++++++++-----
 backend/claude_agent_sdk_runner.py | 281 ++++++++++++++++++++++
 backend/model_backends.py          | 370 +++++++++++++++++++++++++++++
 backend/requirements.txt           |   4 +-
 backend/routes/chatbot.py          | 102 +++++---
 backend/tests/test_agent_tools.py  |  29 +++
 backend/tests/test_api.py          |  30 ++-
 frontend/src/app/ChatPage.tsx      | 155 +++++++++---
 frontend/src/app/layout.tsx        |  16 ++
 modal_app.py                       |  16 +-
 10 files changed, 1023 insertions(+), 146 deletions(-)
 create mode 100644 backend/claude_agent_sdk_runner.py
 create mode 100644 backend/model_backends.py

diff --git a/backend/agent_tools.py b/backend/agent_tools.py
index aaefed2..fa902ff 100644
--- a/backend/agent_tools.py
+++ b/backend/agent_tools.py
@@ -10,6 +10,8 @@
 import sys
 from typing import Any, Dict, List, Optional
 
+from model_backends import get_backend, make_backend_importer
+
 logger = logging.getLogger(__name__)
 
 
@@ -538,6 +540,59 @@ def analyse_microdata(
         return {"error": str(e)}
 
 
+def compute(
+    operation: str,
+    data: List[float],
+    other: Optional[List[float]] = None,
+) -> Dict[str, Any]:
+    """Small numeric helper retained for older tests/tool dispatch paths."""
+    if not data:
+        return {"error": "data must be non-empty"}
+
+    try:
+        if operation == "diff":
+            return {
+                "result": [
+                    data[i + 1] - data[i] for i in range(len(data) - 1)
+                ]
+            }
+        if operation == "pct_change":
+            return {
+                "result": [
+                    0 if data[i] == 0 else (data[i + 1] - data[i]) / data[i] * 100
+                    for i in range(len(data) - 1)
+                ]
+            }
+        if operation == "mean":
+            return {"result": sum(data) / len(data)}
+        if operation == "sum":
+            return {"result": sum(data)}
+        if operation in {"subtract", "divide", "marginal_rate"}:
+            if other is None:
+                return {"error": f"{operation} requires other"}
+            if len(data) != len(other):
+                return {"error": "data and other must have the same length"}
+            if operation == "subtract":
+                return {"result": [a - b for a, b in zip(data, other)]}
+            if operation == "divide":
+                return {
+                    "result": [
+                        0 if b == 0 else a / b for a, b in zip(data, other)
+                    ]
+                }
+            return {
+                "result": [
+                    0
+                    if other[i + 1] == other[i]
+                    else 100 * (data[i + 1] - data[i]) / (other[i + 1] - other[i])
+                    for i in range(len(data) - 1)
+                ]
+            }
+        return {"error": f"Unknown operation: {operation}"}
+    except Exception as e:
+        return {"error": str(e)}
+
+
 def generate_chart(
     chart_type: str, title: str, data: List[Dict[str, Any]], x_field: str, y_fields: List[str],
     x_label: Optional[str] = None, y_label: Optional[str] = None,
@@ -578,29 +633,10 @@ def generate_chart(
         return {"error": str(e)}
 
 
-def run_python(code: str) -> Dict[str, Any]:
-    """Execute Python code with the PolicyEngine UK compiled interface preloaded.
-
-    The code should assign its final result to a variable called `result`.
-    The environment includes the official Python wrapper so runs are easy to
-    reproduce outside the chat app.
-    """
-    import math
+def _safe_builtins_for_backend(backend_id: str) -> Dict[str, Any]:
     import builtins as _builtins
-    _ensure_compiled_package_importable()
-    import pandas as pd
-    import policyengine_uk_compiled as pe
-
-    from policyengine_uk_compiled import (
-        Simulation,
-        StructuralReform,
-        Parameters,
-        aggregate_microdata,
-        combine_microdata,
-        capabilities,
-        ensure_dataset,
-    )
 
+    backend = get_backend(backend_id)
     safe_names = (
         "range", "len", "int", "float", "str", "bool", "list", "dict",
         "tuple", "set", "zip", "enumerate", "map", "filter", "sorted",
@@ -609,37 +645,33 @@ def run_python(code: str) -> Dict[str, Any]:
         "print", "any", "all", "pow", "divmod", "complex", "type",
         "dir", "hasattr", "getattr",
     )
-    safe_builtins = {k: getattr(_builtins, k) for k in safe_names if hasattr(_builtins, k)}
+    safe_builtins = {
+        k: getattr(_builtins, k) for k in safe_names if hasattr(_builtins, k)
+    }
+    safe_builtins["__import__"] = make_backend_importer(backend)
+    return safe_builtins
 
-    try:
-        import numpy as np
-    except ImportError:
-        np = None
+
+def run_python(code: str, backend_id: str = "uk_compiled") -> Dict[str, Any]:
+    """Execute Python code with the selected model backend preloaded.
+
+    The code should assign its final result to a variable called `result`.
+    The backend adapter controls the model-specific globals made available.
+    """
+    backend = get_backend(backend_id)
+    safe_builtins = _safe_builtins_for_backend(backend.id)
 
     output_lines: List[str] = []
     def safe_print(*args, **kwargs):
         output_lines.append(" ".join(str(a) for a in args))
 
     safe_builtins["print"] = safe_print
-    safe_builtins["__import__"] = _safe_import
 
-    allowed_globals: Dict[str, Any] = {
-        "__builtins__": safe_builtins,
-        "math": math,
-        "json": json,
-        "pd": pd,
-        "pe": pe,
-        "Simulation": Simulation,
-        "StructuralReform": StructuralReform,
-        "Parameters": Parameters,
-        "aggregate_microdata": aggregate_microdata,
-        "combine_microdata": combine_microdata,
-        "capabilities": capabilities,
-        "ensure_dataset": ensure_dataset,
-    }
-    if np is not None:
-        allowed_globals["np"] = np
-        allowed_globals["numpy"] = np
+    try:
+        allowed_globals = backend.execution_globals()
+    except Exception as e:
+        return {"error": f"Backend import failed for '{backend.id}': {type(e).__name__}: {e}"}
+    allowed_globals["__builtins__"] = safe_builtins
 
     try:
         exec(code, allowed_globals)
@@ -656,6 +688,7 @@ def safe_print(*args, **kwargs):
     if not response:
         response["result"] = None
         response["note"] = "No 'result' variable was set and nothing was printed."
+    response["backend"] = backend.id
     return response
 
 
@@ -685,10 +718,16 @@ def _run_generator(code: str) -> Dict[str, Any]:
     return result
 
 
-def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
-    logger.info(f"[TOOLS] Executing {tool_name}")
+def execute_tool(
+    tool_name: str,
+    tool_input: Dict[str, Any],
+    backend_id: str = "uk_compiled",
+) -> Dict[str, Any]:
+    logger.info(f"[TOOLS] Executing {tool_name} with backend={backend_id}")
     tools = {
         "run_python": run_python,
+        "compute": compute,
+        "generate_chart": generate_chart,
     }
     if tool_name not in tools:
         return {"error": f"Unknown tool: {tool_name}"}
@@ -698,6 +737,8 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
             logger.info(f"[TOOLS] Running generator for {tool_name}")
             tool_input = _run_generator(tool_input["generator"])
             logger.info(f"[TOOLS] Generator produced keys: {list(tool_input.keys())}")
+        if tool_name == "run_python" and "backend_id" not in tool_input:
+            tool_input = {**tool_input, "backend_id": backend_id}
         result = tools[tool_name](**tool_input)
         logger.info(f"[TOOLS] {tool_name} completed")
         return result
@@ -706,16 +747,27 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
         return {"error": str(e)}
 
 
-TOOL_DEFINITIONS = [
-    {
-        "name": "run_python",
-        "description": "Execute reproducible Python code using the official PolicyEngine UK compiled interface. The environment preloads `policyengine_uk_compiled` as `pe`, plus `Simulation`, `Parameters`, `StructuralReform`, `aggregate_microdata`, `combine_microdata`, `capabilities`, `ensure_dataset`, `pd`, `np`, `json`, and `math`. Assign the final answer to `result` and use `print()` for intermediate output.",
-        "input_schema": {
-            "type": "object",
-            "properties": {
-                "code": {"type": "string", "description": "Python code to execute. Must assign the final answer to `result`. Use the preloaded PolicyEngine interface directly, for example: `sim = Simulation(year=2025)` or `policy = Parameters.model_validate({...})`."},
+def get_tool_definitions(backend_id: str = "uk_compiled") -> List[Dict[str, Any]]:
+    backend = get_backend(backend_id)
+    return [
+        {
+            "name": "run_python",
+            "description": backend.tool_description(),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "code": {
+                        "type": "string",
+                        "description": (
+                            "Python code to execute. Must assign the final answer "
+                            "to `result`. Use the preloaded model interface directly."
+                        ),
+                    },
+                },
+                "required": ["code"],
             },
-            "required": ["code"],
         },
-    },
-]
+    ]
+
+
+TOOL_DEFINITIONS = get_tool_definitions("uk_compiled")
diff --git a/backend/claude_agent_sdk_runner.py b/backend/claude_agent_sdk_runner.py
new file mode 100644
index 0000000..804e438
--- /dev/null
+++ b/backend/claude_agent_sdk_runner.py
@@ -0,0 +1,281 @@
+"""
+Experimental Claude Agent SDK runner for the chat endpoint.
+
+This is deliberately opt-in. It lets us compare the Claude Agent SDK harness
+against the existing direct Anthropic Messages loop while keeping the same
+PolicyEngine backend registry and frontend SSE contract.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, AsyncIterator, Dict, List
+
+from agent_tools import execute_tool
+from model_backends import get_backend
+
+logger = logging.getLogger(__name__)
+
+
+def _conversation_prompt(messages: List[dict]) -> str:
+    lines = [
+        "Continue this chat transcript. Respond only to the latest user message.",
+        "",
+    ]
+    for message in messages:
+        role = "User" if message.get("role") == "user" else "Assistant"
+        lines.append(f"{role}: {message.get('content', '')}")
+        lines.append("")
+    return "\n".join(lines).strip()
+
+
+def _serialise_tool_result(result: Any) -> str:
+    return json.dumps(result, ensure_ascii=False, default=str)
+
+
+def _usage_totals(usage: dict[str, Any] | None) -> dict[str, int]:
+    if not usage:
+        return {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "cache_creation_input_tokens": 0,
+            "cache_read_input_tokens": 0,
+        }
+    return {
+        "input_tokens": int(usage.get("input_tokens", 0) or 0),
+        "output_tokens": int(usage.get("output_tokens", 0) or 0),
+        "cache_creation_input_tokens": int(
+            usage.get("cache_creation_input_tokens", 0) or 0
+        ),
+        "cache_read_input_tokens": int(usage.get("cache_read_input_tokens", 0) or 0),
+    }
+
+
+def _event_type(event: dict[str, Any]) -> str:
+    return str(event.get("type", ""))
+
+
+def _content_block(event: dict[str, Any]) -> dict[str, Any]:
+    block = event.get("content_block")
+    return block if isinstance(block, dict) else {}
+
+
+def _delta(event: dict[str, Any]) -> dict[str, Any]:
+    delta = event.get("delta")
+    return delta if isinstance(delta, dict) else {}
+
+
+async def generate_claude_agent_sdk_stream(
+    *,
+    conversation: List[dict],
+    system_prompt: str,
+    plan_mode: bool,
+    session_id: str,
+    user_id: str | None,
+    backend_id: str,
+    model: str,
+) -> AsyncIterator[str]:
+    try:
+        from claude_agent_sdk import ClaudeAgentOptions, create_sdk_mcp_server, query, tool
+        from claude_agent_sdk.types import (
+            AssistantMessage,
+            ResultMessage,
+            StreamEvent,
+            TextBlock,
+            ToolResultBlock,
+            ToolUseBlock,
+        )
+    except ImportError as exc:
+        yield f"data: {json.dumps({'type': 'error', 'content': f'Claude Agent SDK is not installed: {exc}'})}\n\n"
+        return
+
+    backend = get_backend(backend_id)
+
+    @tool(
+        "run_python",
+        backend.tool_description(),
+        {
+            "type": "object",
+            "properties": {
+                "code": {
+                    "type": "string",
+                    "description": (
+                        "Python code to execute. Must assign the final answer to "
+                        "`result`. Use the preloaded model interface directly."
+                    ),
+                }
+            },
+            "required": ["code"],
+        },
+    )
+    async def run_python_tool(args: dict[str, Any]) -> dict[str, Any]:
+        result = execute_tool("run_python", {"code": args.get("code", "")}, backend.id)
+        tool_result_queue.append(_queueable_tool_result(result))
+        return {
+            "content": [
+                {
+                    "type": "text",
+                    "text": _serialise_tool_result(result),
+                }
+            ]
+        }
+
+    mcp_server = create_sdk_mcp_server(
+        name="policyengine",
+        version="0.1.0",
+        tools=[run_python_tool],
+    )
+    allowed_tools = [] if plan_mode else ["mcp__policyengine__run_python"]
+    options = ClaudeAgentOptions(
+        system_prompt=system_prompt,
+        model=model,
+        mcp_servers={"policyengine": mcp_server},
+        allowed_tools=allowed_tools,
+        permission_mode="plan" if plan_mode else "default",
+        include_partial_messages=True,
+        max_turns=1 if plan_mode else 60,
+    )
+
+    assistant_content = ""
+    usage = _usage_totals(None)
+    tool_inputs: Dict[str, dict[str, Any]] = {}
+    tool_names: Dict[str, str] = {}
+    tool_order: List[str] = []
+    tool_result_queue: List[dict[str, Any]] = []
+    emitted_tool_results: set[str] = set()
+    announced_tool_ids: set[str] = set()
+
+    def _queueable_tool_result(result: dict[str, Any]) -> dict[str, Any]:
+        result_json = _serialise_tool_result(result)
+        return {
+            "status": "error" if result.get("error") else "success",
+            "result_summary": (
+                result_json[:5000] + "..." if len(result_json) > 5000 else result_json
+            ),
+        }
+
+    def _flush_tool_result_events() -> list[str]:
+        events = []
+        while tool_result_queue:
+            tool_id = next(
+                (candidate for candidate in tool_order if candidate not in emitted_tool_results),
+                None,
+            )
+            if not tool_id:
+                break
+            emitted_tool_results.add(tool_id)
+            result = tool_result_queue.pop(0)
+            events.append(
+                f"data: {json.dumps({'type': 'tool_result', 'tool_name': 'run_python', 'tool_id': tool_id, 'status': result['status'], 'result_summary': result['result_summary']})}\n\n"
+            )
+        return events
+
+    try:
+        async for message in query(
+            prompt=_conversation_prompt(conversation),
+            options=options,
+        ):
+            for event in _flush_tool_result_events():
+                yield event
+
+            if isinstance(message, StreamEvent):
+                event = message.event
+                event_type = _event_type(event)
+                if event_type == "content_block_start":
+                    block = _content_block(event)
+                    if block.get("type") == "tool_use":
+                        tool_id = str(block.get("id", ""))
+                        tool_name = str(block.get("name", "run_python"))
+                        if tool_id:
+                            tool_names[tool_id] = tool_name
+                            tool_inputs.setdefault(tool_id, {})
+                            if tool_id not in tool_order:
+                                tool_order.append(tool_id)
+                        if tool_id and tool_id not in announced_tool_ids:
+                            announced_tool_ids.add(tool_id)
+                            yield f"data: {json.dumps({'type': 'tool_start', 'tool_name': 'run_python', 'tool_id': tool_id})}\n\n"
+                elif event_type == "content_block_delta":
+                    delta = _delta(event)
+                    if delta.get("type") == "text_delta" and delta.get("text"):
+                        text = str(delta.get("text", ""))
+                        assistant_content += text
+                        yield f"data: {json.dumps({'type': 'chunk', 'content': text})}\n\n"
+
+            elif isinstance(message, AssistantMessage):
+                if message.usage:
+                    usage = _usage_totals(message.usage)
+                for block in message.content:
+                    if isinstance(block, TextBlock):
+                        if not assistant_content and block.text:
+                            assistant_content += block.text
+                            yield f"data: {json.dumps({'type': 'chunk', 'content': block.text})}\n\n"
+                    elif isinstance(block, ToolUseBlock):
+                        tool_names[block.id] = block.name
+                        tool_inputs[block.id] = block.input
+                        if block.id not in tool_order:
+                            tool_order.append(block.id)
+                        yield f"data: {json.dumps({'type': 'tool_use', 'tool_name': 'run_python', 'tool_id': block.id, 'tool_input': block.input, 'status': 'pending'})}\n\n"
+                    elif isinstance(block, ToolResultBlock):
+                        content = block.content
+                        if isinstance(content, list):
+                            content_text = "\n".join(
+                                str(item.get("text", item))
+                                for item in content
+                                if isinstance(item, dict)
+                            )
+                        else:
+                            content_text = str(content or "")
+                        result_summary = (
+                            content_text[:5000] + "..."
+                            if len(content_text) > 5000
+                            else content_text
+                        )
+                        yield f"data: {json.dumps({'type': 'tool_result', 'tool_name': 'run_python', 'tool_id': block.tool_use_id, 'status': 'error' if block.is_error else 'success', 'result_summary': result_summary})}\n\n"
+
+            elif isinstance(message, ResultMessage):
+                for event in _flush_tool_result_events():
+                    yield event
+                if message.usage:
+                    usage = _usage_totals(message.usage)
+                if message.result and not assistant_content:
+                    assistant_content = message.result
+                    yield f"data: {json.dumps({'type': 'chunk', 'content': message.result})}\n\n"
+                billing = None
+                try:
+                    from routes.billing import record_usage
+
+                    billing = record_usage(
+                        user_id=user_id,
+                        session_id=session_id,
+                        model=model,
+                        input_tokens=usage["input_tokens"],
+                        output_tokens=usage["output_tokens"],
+                        cache_creation_input_tokens=usage[
+                            "cache_creation_input_tokens"
+                        ],
+                        cache_read_input_tokens=usage["cache_read_input_tokens"],
+                    )
+                except Exception as exc:
+                    logger.warning(f"[CHAT][SDK] Failed to record usage: {exc}")
+
+                done = {
+                    "type": "done",
+                    "content": assistant_content,
+                    "session_id": session_id,
+                    "model": model,
+                    "model_backend": backend.id,
+                    "agent_runner": "claude_sdk",
+                    "usage": usage,
+                    "cost_gbp": billing["cost_gbp"] if billing else None,
+                    "balance": billing["balance"] if billing else None,
+                    "sdk_session_id": message.session_id,
+                    "sdk_cost_usd": message.total_cost_usd,
+                }
+                yield f"data: {json.dumps(done)}\n\n"
+                return
+
+        yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'model_backend': backend.id, 'agent_runner': 'claude_sdk', 'usage': usage, 'cost_gbp': None, 'balance': None})}\n\n"
+    except Exception as exc:
+        logger.exception("[CHAT][SDK] Claude Agent SDK runner failed")
+        yield f"data: {json.dumps({'type': 'error', 'content': str(exc)})}\n\n"
diff --git a/backend/model_backends.py b/backend/model_backends.py
new file mode 100644
index 0000000..f21d52f
--- /dev/null
+++ b/backend/model_backends.py
@@ -0,0 +1,370 @@
+"""
+Model backend adapters for the chat agent.
+
+The chat UI currently exposes one flexible Python execution tool. These
+adapters keep that contract stable while allowing the preloaded model
+interface, prompt context, and tool documentation to vary by backend.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from importlib.metadata import PackageNotFoundError, version
+import json
+import math
+from pathlib import Path
+import sys
+from typing import Any, Callable, Dict, Iterable
+
+
+class BackendImportError(RuntimeError):
+    """Raised when a selected model backend cannot be imported."""
+
+
+def _ensure_sibling_package_importable(
+    import_name: str,
+    sibling_candidates: Iterable[Path],
+) -> None:
+    try:
+        __import__(import_name)
+        return
+    except ModuleNotFoundError:
+        pass
+
+    added_paths = []
+    for candidate in sibling_candidates:
+        if candidate.is_dir():
+            candidate_str = str(candidate)
+            if candidate_str not in sys.path:
+                sys.path.insert(0, candidate_str)
+                added_paths.append(candidate_str)
+
+    try:
+        __import__(import_name)
+        return
+    except ModuleNotFoundError as exc:
+        detail = str(exc)
+
+    raise BackendImportError(
+        f"{import_name} is not importable. Install the package or make sure a "
+        "local checkout is available next to policyengine-uk-chat. "
+        f"Added paths: {added_paths or 'none'}. Import error: {detail}"
+    )
+
+
+@dataclass(frozen=True)
+class ModelBackend:
+    id: str
+    display_name: str
+    package_name: str
+    package_label: str
+    import_roots: frozenset[str]
+
+    def package_version(self) -> str:
+        try:
+            return version(self.package_name)
+        except PackageNotFoundError:
+            return "unknown"
+
+    def prompt_context(self) -> str:
+        raise NotImplementedError
+
+    def tool_description(self) -> str:
+        raise NotImplementedError
+
+    def execution_globals(self) -> Dict[str, Any]:
+        raise NotImplementedError
+
+
+class UKCompiledBackend(ModelBackend):
+    def __init__(self) -> None:
+        super().__init__(
+            id="uk_compiled",
+            display_name="PolicyEngine UK compiled Rust backend",
+            package_name="policyengine-uk-compiled",
+            package_label="policyengine-uk-compiled",
+            import_roots=frozenset(
+                {"json", "math", "numpy", "pandas", "policyengine_uk_compiled"}
+            ),
+        )
+
+    def _ensure_importable(self) -> None:
+        repo_parent = Path(__file__).resolve().parents[2]
+        _ensure_sibling_package_importable(
+            "policyengine_uk_compiled",
+            [
+                repo_parent / "policyengine-uk-rust" / "interfaces" / "python",
+                repo_parent
+                / "policyengine-uk-rust-codex-debug-issue"
+                / "interfaces"
+                / "python",
+            ],
+        )
+
+    def prompt_context(self) -> str:
+        return """CRITICAL - USE THE OFFICIAL POLICYENGINE UK COMPILED INTERFACE:
+- The selected backend is `uk_compiled`, the Rust-backed UK engine exposed through `policyengine_uk_compiled`.
+- The Python environment preloads:
+  `policyengine_uk_compiled` as `pe`
+  `Simulation`
+  `Parameters`
+  `StructuralReform`
+  `aggregate_microdata`
+  `combine_microdata`
+  `capabilities`
+  `ensure_dataset`
+  `pd`, `np`, `json`, `math`
+- Prefer writing code directly against those objects so the run is reproducible outside chat.
+- Do not recreate policy logic manually if the package already provides it.
+
+COMMON WORKFLOWS FOR THIS BACKEND:
+- Baseline economy-wide run:
+  `caps = capabilities()`
+  `sim = Simulation(year=2025, dataset="frs")`
+  `result = sim.run().model_dump()`
+- Reform run:
+  `policy = Parameters.model_validate({"income_tax": {"personal_allowance": 15000}})`
+  `result = sim.run(policy=policy).model_dump()`
+- Custom household run:
+  build `persons`, `benunits`, and `households` DataFrames, then pass them to `Simulation(...)`
+- Microdata analysis:
+  `micro = sim.run_microdata(...)` then analyse `micro.persons`, `micro.benunits`, or `micro.households` with pandas
+
+MODELLING SCOPE:
+- The compiled backend covers the model surface exposed by `policyengine_uk_compiled`.
+- Use `capabilities()` to check what is available locally before committing to an approach."""
+
+    def tool_description(self) -> str:
+        return (
+            "Execute reproducible Python code using the PolicyEngine UK compiled "
+            "backend. The environment preloads `policyengine_uk_compiled` as `pe`, "
+            "plus `Simulation`, `Parameters`, `StructuralReform`, "
+            "`aggregate_microdata`, `combine_microdata`, `capabilities`, "
+            "`ensure_dataset`, `pd`, `np`, `json`, and `math`. Assign the final "
+            "answer to `result` and use `print()` for short diagnostics."
+        )
+
+    def execution_globals(self) -> Dict[str, Any]:
+        self._ensure_importable()
+        import pandas as pd
+        import policyengine_uk_compiled as pe
+
+        try:
+            import numpy as np
+        except ImportError:
+            np = None
+
+        from policyengine_uk_compiled import (
+            Parameters,
+            Simulation,
+            StructuralReform,
+            aggregate_microdata,
+            capabilities,
+            combine_microdata,
+            ensure_dataset,
+        )
+
+        globals_dict: Dict[str, Any] = {
+            "math": math,
+            "json": json,
+            "pd": pd,
+            "pe": pe,
+            "Simulation": Simulation,
+            "StructuralReform": StructuralReform,
+            "Parameters": Parameters,
+            "aggregate_microdata": aggregate_microdata,
+            "combine_microdata": combine_microdata,
+            "capabilities": capabilities,
+            "ensure_dataset": ensure_dataset,
+        }
+        if np is not None:
+            globals_dict["np"] = np
+            globals_dict["numpy"] = np
+        return globals_dict
+
+
+class UKPolicyEnginePythonBackend(ModelBackend):
+    def __init__(self) -> None:
+        super().__init__(
+            id="uk_python",
+            display_name="PolicyEngine UK Python backend",
+            package_name="policyengine-uk",
+            package_label="policyengine-uk",
+            import_roots=frozenset(
+                {
+                    "json",
+                    "math",
+                    "numpy",
+                    "pandas",
+                    "policyengine",
+                    "policyengine_core",
+                    "policyengine_uk",
+                    "microdf",
+                }
+            ),
+        )
+
+    def _ensure_importable(self) -> None:
+        repo_parent = Path(__file__).resolve().parents[2]
+        _ensure_sibling_package_importable(
+            "policyengine_uk",
+            [
+                repo_parent / "policyengine-core",
+                repo_parent / "policyengine.py" / "src",
+                repo_parent / "policyengine-uk",
+            ],
+        )
+
+    def prompt_context(self) -> str:
+        return """CRITICAL - USE THE POLICYENGINE UK PYTHON MODEL INTERFACE:
+- The selected backend is `uk_python`, the Python `policyengine-uk` model package.
+- This is the detailed PolicyEngine Core/OpenFisca-style UK model, not the compiled Rust wrapper.
+- The Python environment preloads:
+  `policyengine_uk` as `pe`
+  `Simulation`
+  `Microsimulation`
+  `CountryTaxBenefitSystem`
+  `Scenario`
+  `capabilities`
+  `pd`, `np`, `json`, `math`
+- If installed, the higher-level `policyengine` package is also preloaded as `policyengine`.
+- Prefer writing code against `policyengine_uk` objects and formulas rather than recreating policy logic.
+
+COMMON WORKFLOWS FOR THIS BACKEND:
+- First inspect backend details:
+  `result = capabilities()`
+- Custom household/situation run:
+  `sim = Simulation(situation={...})`
+  `result = sim.calculate("household_net_income", 2025).tolist()`
+- Microsimulation from published UK data:
+  `sim = Microsimulation(dataset="hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5")`
+  `result = sim.calculate("household_net_income", 2025).head().to_list()`
+- Parameter reform:
+  pass parameter changes through `Scenario` or mutate a simulation with documented `policyengine_uk` helpers.
+
+MODELLING SCOPE:
+- This backend exposes the Python `policyengine-uk` model surface. Its API, datasets, variables, and results can differ from `uk_compiled`.
+- If a dataset is unavailable locally or requires a download/token, report that clearly instead of guessing."""
+
+    def tool_description(self) -> str:
+        return (
+            "Execute reproducible Python code using the Python `policyengine-uk` "
+            "backend. The environment preloads `policyengine_uk` as `pe`, "
+            "`Simulation`, `Microsimulation`, `CountryTaxBenefitSystem`, "
+            "`Scenario`, `capabilities`, `pd`, `np`, `json`, and `math`; "
+            "the higher-level `policyengine` package is available when installed. "
+            "Assign the final answer to `result` and use `print()` for short diagnostics."
+        )
+
+    def execution_globals(self) -> Dict[str, Any]:
+        self._ensure_importable()
+        import pandas as pd
+        import policyengine_uk as pe
+
+        try:
+            import numpy as np
+        except ImportError:
+            np = None
+
+        try:
+            import policyengine
+        except ImportError:
+            policyengine = None
+
+        from policyengine_uk import (
+            CountryTaxBenefitSystem,
+            Microsimulation,
+            Simulation,
+        )
+        from policyengine_uk.utils.scenario import Scenario
+
+        def capabilities() -> Dict[str, Any]:
+            system = CountryTaxBenefitSystem()
+            variables = system.variables
+            parameters = system.parameters
+            return {
+                "backend": self.id,
+                "display_name": self.display_name,
+                "package": "policyengine-uk",
+                "interface": "Python PolicyEngine Core/OpenFisca-style model",
+                "preloaded": [
+                    "policyengine_uk as pe",
+                    "Simulation",
+                    "Microsimulation",
+                    "CountryTaxBenefitSystem",
+                    "Scenario",
+                    "pd",
+                    "np",
+                    "json",
+                    "math",
+                ],
+                "variable_count": len(variables),
+                "sample_variables": sorted(variables)[:50],
+                "parameter_root_children": sorted(parameters.children.keys()),
+                "dataset_notes": [
+                    "Pass a situation dict for household-style calculations.",
+                    "Pass a UKSingleYearDataset, UKMultiYearDataset, DataFrame, or hf:// URL for microsimulation.",
+                    "No default dataset is used unless POLICYENGINE_UK_DEFAULT_DATASET is set.",
+                ],
+                "comparison_note": (
+                    "Results may differ from uk_compiled because this backend uses "
+                    "the Python policyengine-uk model and its datasets/API surface."
+                ),
+            }
+
+        globals_dict: Dict[str, Any] = {
+            "math": math,
+            "json": json,
+            "pd": pd,
+            "pe": pe,
+            "policyengine_uk": pe,
+            "Simulation": Simulation,
+            "Microsimulation": Microsimulation,
+            "CountryTaxBenefitSystem": CountryTaxBenefitSystem,
+            "Scenario": Scenario,
+            "capabilities": capabilities,
+        }
+        if policyengine is not None:
+            globals_dict["policyengine"] = policyengine
+        if np is not None:
+            globals_dict["np"] = np
+            globals_dict["numpy"] = np
+        return globals_dict
+
+
+_BACKENDS: Dict[str, ModelBackend] = {
+    "uk_compiled": UKCompiledBackend(),
+    "uk_python": UKPolicyEnginePythonBackend(),
+}
+
+
+def available_backends() -> Dict[str, Dict[str, str]]:
+    return {
+        backend_id: {
+            "id": backend.id,
+            "display_name": backend.display_name,
+            "package_label": backend.package_label,
+            "version": backend.package_version(),
+        }
+        for backend_id, backend in _BACKENDS.items()
+    }
+
+
+def get_backend(backend_id: str | None = None) -> ModelBackend:
+    selected = backend_id or "uk_compiled"
+    if selected not in _BACKENDS:
+        valid = ", ".join(sorted(_BACKENDS))
+        raise ValueError(f"Unknown model backend '{selected}'. Valid backends: {valid}")
+    return _BACKENDS[selected]
+
+
+def make_backend_importer(backend: ModelBackend) -> Callable[..., Any]:
+    def _safe_import(name, globals=None, locals=None, fromlist=(), level=0):
+        root_name = name.split(".")[0]
+        if root_name not in backend.import_roots:
+            raise ImportError(
+                f"Import of '{name}' is not allowed for backend '{backend.id}'"
+            )
+        return __import__(name, globals, locals, fromlist, level)
+
+    return _safe_import
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 4feb984..5950419 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -3,10 +3,12 @@ uvicorn[standard]
 sqlmodel
 psycopg2-binary
 anthropic
+claude-agent-sdk
 pydantic-ai[anthropic]
 policyengine-uk-compiled>=0.20.0
-policyengine_uk
+policyengine_uk==2.88.0
 pandas
 httpx
 supabase
 stripe
+python-dateutil
diff --git a/backend/routes/chatbot.py b/backend/routes/chatbot.py
index fa1af54..3bc9d93 100644
--- a/backend/routes/chatbot.py
+++ b/backend/routes/chatbot.py
@@ -17,7 +17,8 @@
 from pydantic_ai.models.anthropic import AnthropicModel
 from pydantic_ai.settings import ModelSettings
 
-from agent_tools import execute_tool, TOOL_DEFINITIONS
+from agent_tools import execute_tool, get_tool_definitions
+from model_backends import available_backends, get_backend
 
 logger = logging.getLogger(__name__)
 
@@ -26,7 +27,7 @@
 # ---------------------------------------------------------------------------
 # System prompt
 # ---------------------------------------------------------------------------
-SYSTEM_PROMPT = """You are an expert policy analysis assistant for a UK microsimulation platform. You help users understand and analyse UK tax and benefit policy using reproducible Python code.
+SYSTEM_PROMPT_TEMPLATE = """You are an expert policy analysis assistant for a microsimulation platform. You help users understand and analyse tax and benefit policy using reproducible Python code.
 
 CRITICAL - ALWAYS COMPUTE WITH PYTHON:
 - Never answer quantitative policy questions from memory.
@@ -39,19 +40,7 @@
 - Use that to ground yourself in the available datasets, years, programmes, and caveats before you simulate.
 - If the user asks about something outside the modelled scope, say so clearly instead of guessing.
 
-CRITICAL - USE THE OFFICIAL POLICYENGINE PYTHON INTERFACE:
-- The Python environment preloads:
-  `policyengine_uk_compiled` as `pe`
-  `Simulation`
-  `Parameters`
-  `StructuralReform`
-  `aggregate_microdata`
-  `combine_microdata`
-  `capabilities`
-  `ensure_dataset`
-  `pd`, `np`, `json`, `math`
-- Prefer writing code directly against those objects so the run is reproducible outside chat.
-- Do not recreate policy logic manually if the package already provides it.
+{backend_prompt_context}
 
 REPRODUCIBILITY RULES:
 - Write clear Python that another developer could copy and run.
@@ -61,10 +50,10 @@
 - Do not rely on hidden reasoning for calculations when code can do the work.
 
 API AND DATASETS:
-- A live API reference (docstrings, `capabilities()` snapshot, full `Parameters` JSON schema) is attached to this system prompt — consult it for signatures, reform keys, and dataset descriptions rather than guessing.
+- A live API reference (docstrings, `capabilities()` snapshot, full `Parameters` JSON schema) is attached to this system prompt — consult it for signatures, reform keys, and dataset descriptions rather than guessing. The reference reflects the currently selected backend.
 - Call `capabilities()` at the start of a new line of analysis to check what's modelled and locally available before committing to an approach.
 - Tell the user which dataset you used when it matters.
-- If something is not modelled well enough for a quantitative answer, say so clearly and do not fabricate estimates.
+- If a dataset is unavailable in the selected backend, explain the limitation rather than fabricating estimates.
 
 ANALYTICAL NOTES:
 - Decile impacts are decile-level averages, not economy-wide means.
@@ -80,11 +69,18 @@
 """
 
 
+def _build_system_prompt(backend_id: str) -> str:
+    backend = get_backend(backend_id)
+    return SYSTEM_PROMPT_TEMPLATE.format(
+        backend_prompt_context=backend.prompt_context()
+    )
+
+
 # ---------------------------------------------------------------------------
 # Pydantic-AI agent setup
 # ---------------------------------------------------------------------------
 
-# We build the agent with tools dynamically from TOOL_DEFINITIONS
+# We build the agent with tools dynamically per backend via get_tool_definitions.
 # pydantic-ai uses its own tool registration, but we'll drive it through
 # our own SSE loop using the underlying model API directly for streaming.
 
@@ -123,18 +119,19 @@ def _get_sync_anthropic_client():
     return anthropic_sdk.Anthropic(api_key=api_key)
 
 
-def _tool_defs_for_anthropic():
-    """Convert our TOOL_DEFINITIONS to Anthropic SDK format.
+def _tool_defs_for_anthropic(backend_id: str):
+    """Convert backend tool definitions to Anthropic SDK format.
     Mark the last tool with cache_control so the system prompt + all tools
     are cached across requests (prompt caching)."""
     defs = []
-    for i, t in enumerate(TOOL_DEFINITIONS):
+    tool_definitions = get_tool_definitions(backend_id)
+    for i, t in enumerate(tool_definitions):
         d = {
             "name": t["name"],
             "description": t["description"],
             "input_schema": t["input_schema"],
         }
-        if i == len(TOOL_DEFINITIONS) - 1:
+        if i == len(tool_definitions) - 1:
             d["cache_control"] = {"type": "ephemeral"}
         defs.append(d)
     return defs
@@ -149,10 +146,10 @@ def _estimate_message_tokens(messages: List[dict]) -> int:
     return char_count // 4
 
 
-def _select_chat_model(messages: List[dict]) -> str:
+def _select_chat_model(messages: List[dict], backend_id: str) -> str:
     estimated_input_tokens = (
         _estimate_message_tokens(messages)
-        + len(SYSTEM_PROMPT) // 4
+        + len(_build_system_prompt(backend_id)) // 4
         + len(REFERENCE_DOC) // 4
     )
     if estimated_input_tokens > FAST_MODEL_MAX_INPUT_TOKENS:
@@ -160,7 +157,7 @@ def _select_chat_model(messages: List[dict]) -> str:
     return DEFAULT_FAST_MODEL
 
 
-def _build_system_blocks(plan_mode: bool = False) -> List[dict]:
+def _build_system_blocks(backend_id: str, plan_mode: bool = False) -> List[dict]:
     """System prompt + cached library reference + optional plan-mode directive.
 
     The system prompt and reference are each marked with cache_control so they
@@ -169,7 +166,7 @@ def _build_system_blocks(plan_mode: bool = False) -> List[dict]:
     """
     blocks: List[dict] = [{
         "type": "text",
-        "text": SYSTEM_PROMPT,
+        "text": _build_system_prompt(backend_id),
         "cache_control": {"type": "ephemeral"},
     }]
     if REFERENCE_DOC:
@@ -196,6 +193,7 @@ class ChatRequest(BaseModel):
     messages: List[ChatMessage]
     session_id: str | None = None
     user_id: str | None = None
+    model_backend: str | None = None
     plan_mode: bool = False
 
 
@@ -239,6 +237,14 @@ def generate_title(request: TitleRequest):
     return {"title": response.content[0].text.strip()}
 
 
+@router.get("/backends")
+def list_backends():
+    return {
+        "default": os.environ.get("POLICYENGINE_CHAT_BACKEND", "uk_compiled"),
+        "backends": available_backends(),
+    }
+
+
 # ---------------------------------------------------------------------------
 # Chat endpoint — SSE streaming
 # ---------------------------------------------------------------------------
@@ -257,6 +263,13 @@ async def chat_message(request: ChatRequest, http_request: Request):
             pass  # Supabase not configured — skip billing check
 
     session_id = request.session_id or str(uuid.uuid4())
+    backend_id = request.model_backend or os.environ.get(
+        "POLICYENGINE_CHAT_BACKEND", "uk_compiled"
+    )
+    try:
+        backend = get_backend(backend_id)
+    except ValueError as e:
+        return JSONResponse(status_code=400, content={"error": str(e)})
 
     messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
 
@@ -268,6 +281,27 @@ async def chat_message(request: ChatRequest, http_request: Request):
         else:
             deduplicated[-1]["content"] += "\n\n" + msg["content"]
 
+    if os.environ.get("POLICYENGINE_CHAT_AGENT_RUNNER") == "claude_sdk":
+        from claude_agent_sdk_runner import generate_claude_agent_sdk_stream
+
+        # The SDK runner takes a single system_prompt string. Plan mode is still
+        # enforced inside it (see runner). Cached reference doc is *not* sent
+        # through this path yet — the SDK runner is opt-in/experimental.
+        return StreamingResponse(
+            generate_claude_agent_sdk_stream(
+                conversation=deduplicated.copy(),
+                system_prompt=_build_system_prompt(backend.id)
+                + ("\n\n" + PLAN_MODE_DIRECTIVE if request.plan_mode else ""),
+                plan_mode=request.plan_mode,
+                session_id=session_id,
+                user_id=user_id,
+                backend_id=backend.id,
+                model=_select_chat_model(deduplicated, backend.id),
+            ),
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+        )
+
     async def generate_stream():
         try:
             conversation = deduplicated.copy()
@@ -280,13 +314,13 @@ async def generate_stream():
             recent_tool_calls: List[str] = []
 
             client = _get_anthropic_client()
-            model = _select_chat_model(conversation)
-            tools = _tool_defs_for_anthropic()
+            model = _select_chat_model(conversation, backend.id)
+            tools = _tool_defs_for_anthropic(backend.id)
             plan_mode = request.plan_mode
-            system_blocks = _build_system_blocks(plan_mode=plan_mode)
+            system_blocks = _build_system_blocks(backend.id, plan_mode=plan_mode)
 
             logger.info(
-                f"[CHAT] Session {session_id}: {len(conversation)} messages"
+                f"[CHAT] Session {session_id}: {len(conversation)} messages, backend={backend.id}"
                 f"{' [PLAN MODE]' if plan_mode else ''}"
             )
 
@@ -394,7 +428,7 @@ async def generate_stream():
                         )
                     except Exception as e:
                         logger.warning(f"[CHAT] Failed to record usage: {e}")
-                    yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n"
+                    yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'model_backend': backend.id, 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n"
                     break
 
                 # Detect infinite loops
@@ -420,7 +454,9 @@ async def generate_stream():
                 async def execute_tool_async(tu):
                     loop = asyncio.get_event_loop()
                     logger.info(f"[CHAT] Starting tool: {tu['name']} input={tu['input']}")
-                    result = await loop.run_in_executor(None, execute_tool, tu["name"], tu["input"])
+                    result = await loop.run_in_executor(
+                        None, execute_tool, tu["name"], tu["input"], backend.id
+                    )
                     logger.info(f"[CHAT] Finished tool: {tu['name']} result_keys={list(result.keys()) if isinstance(result, dict) else type(result)}")
                     return tu, result
 
@@ -475,7 +511,7 @@ async def execute_tool_async(tu):
                 except Exception as e:
                     logger.warning(f"[CHAT] Failed to record usage: {e}")
                 yield f"data: {json.dumps({'type': 'chunk', 'content': '\\n\\n*[Reached maximum iterations]*'})}\n\n"
-                yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n"
+                yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'model_backend': backend.id, 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n"
 
         except Exception as e:
             import traceback
diff --git a/backend/tests/test_agent_tools.py b/backend/tests/test_agent_tools.py
index 0740e52..773a316 100644
--- a/backend/tests/test_agent_tools.py
+++ b/backend/tests/test_agent_tools.py
@@ -11,10 +11,39 @@
     compute,
     generate_chart,
     execute_tool,
+    get_tool_definitions,
     _build_compiled_policy,
     _json_safe,
     run_python,
 )
+from model_backends import available_backends, get_backend
+
+
+# ---------------------------------------------------------------------------
+# model backends
+# ---------------------------------------------------------------------------
+
+class TestModelBackends:
+    def test_available_backends_include_compiled_and_python(self):
+        backends = available_backends()
+        assert "uk_compiled" in backends
+        assert "uk_python" in backends
+        assert backends["uk_compiled"]["package_label"] == "policyengine-uk-compiled"
+        assert backends["uk_python"]["package_label"] == "policyengine-uk"
+        assert "version" in backends["uk_compiled"]
+        assert "version" in backends["uk_python"]
+
+    def test_backend_tool_descriptions_are_backend_specific(self):
+        compiled_tools = get_tool_definitions("uk_compiled")
+        python_tools = get_tool_definitions("uk_python")
+        assert compiled_tools[0]["name"] == "run_python"
+        assert python_tools[0]["name"] == "run_python"
+        assert "compiled" in compiled_tools[0]["description"]
+        assert "policyengine-uk" in python_tools[0]["description"]
+
+    def test_unknown_backend_rejected(self):
+        with pytest.raises(ValueError):
+            get_backend("not_a_backend")
 
 
 # ---------------------------------------------------------------------------
diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py
index b552c99..e47ee4a 100644
--- a/backend/tests/test_api.py
+++ b/backend/tests/test_api.py
@@ -23,6 +23,20 @@ def test_health(self):
         assert r.json()["status"] == "ok"
 
 
+class TestChatBackends:
+    def test_lists_backends(self):
+        r = client.get("/chat/backends")
+        assert r.status_code == 200
+        data = r.json()
+        assert data["default"] == "uk_compiled"
+        assert "uk_compiled" in data["backends"]
+        assert "uk_python" in data["backends"]
+        assert data["backends"]["uk_compiled"]["package_label"] == "policyengine-uk-compiled"
+        assert data["backends"]["uk_python"]["package_label"] == "policyengine-uk"
+        assert "version" in data["backends"]["uk_compiled"]
+        assert "version" in data["backends"]["uk_python"]
+
+
 # ---------------------------------------------------------------------------
 # Conversations CRUD
 # ---------------------------------------------------------------------------
@@ -172,6 +186,14 @@ def parse_sse(response_text: str) -> list[dict]:
 
 
 class TestChatMessage:
+    def test_unknown_model_backend_returns_400(self):
+        r = client.post("/chat/message", json={
+            "messages": [{"role": "user", "content": "hello"}],
+            "model_backend": "not_a_backend",
+        })
+        assert r.status_code == 400
+        assert "Unknown model backend" in r.json()["error"]
+
     def test_simple_chat_returns_sse(self):
         with client.stream("POST", "/chat/message", json={
             "messages": [{"role": "user", "content": "Say exactly: hello"}],
@@ -267,18 +289,18 @@ class TestPlanMode:
 
     def test_directive_present_when_plan_mode_on(self):
         from routes.chatbot import _build_system_blocks, PLAN_MODE_DIRECTIVE
-        blocks = _build_system_blocks(plan_mode=True)
+        blocks = _build_system_blocks("uk_compiled", plan_mode=True)
         assert any(PLAN_MODE_DIRECTIVE in b.get("text", "") for b in blocks)
 
     def test_directive_absent_when_plan_mode_off(self):
         from routes.chatbot import _build_system_blocks, PLAN_MODE_DIRECTIVE
-        blocks = _build_system_blocks(plan_mode=False)
+        blocks = _build_system_blocks("uk_compiled", plan_mode=False)
         assert not any(PLAN_MODE_DIRECTIVE in b.get("text", "") for b in blocks)
 
     def test_base_prompt_cache_breakpoint_unchanged(self):
         from routes.chatbot import _build_system_blocks
-        on = _build_system_blocks(plan_mode=True)
-        off = _build_system_blocks(plan_mode=False)
+        on = _build_system_blocks("uk_compiled", plan_mode=True)
+        off = _build_system_blocks("uk_compiled", plan_mode=False)
         assert on[0] == off[0]
         assert "cache_control" in on[0]
 
diff --git a/frontend/src/app/ChatPage.tsx b/frontend/src/app/ChatPage.tsx
index b650092..bcef580 100644
--- a/frontend/src/app/ChatPage.tsx
+++ b/frontend/src/app/ChatPage.tsx
@@ -125,6 +125,29 @@ interface BalanceSummary {
   total_available_gbp: number;
 }
 
+interface ModelBackendOption {
+  id: string;
+  display_name: string;
+  package_label: string;
+  version: string;
+}
+
+interface ModelBackendsResponse {
+  default: string;
+  backends: Record<string, ModelBackendOption>;
+}
+
+function formatBackendLabel(backend: ModelBackendOption): string {
+  if (backend.id === "uk_compiled") return "Compiled";
+  if (backend.id === "uk_python") return "Python";
+  return backend.display_name;
+}
+
+function formatBackendVersion(backend: ModelBackendOption | undefined): string | null {
+  if (!backend?.package_label || !backend.version || backend.version === "unknown") return null;
+  return `${backend.package_label} v${backend.version}`;
+}
+
 async function apiRequest<T>(method: string, endpoint: string, params?: Record<string, string>, body?: unknown): Promise<T> {
   const url = new URL(getBackendEndpoint(endpoint), window.location.origin);
   if (params) Object.entries(params).forEach(([k, v]) => url.searchParams.set(k, v));
@@ -163,18 +186,21 @@ export default function ChatPage() {
   const [reportNote, setReportNote] = useState("");
   const [reportError, setReportError] = useState<string | null>(null);
   const [reportSubmitting, setReportSubmitting] = useState(false);
-  const [planMode, setPlanMode] = useState(false);
   const scrollRef = useRef<HTMLDivElement>(null);
   const inputRef = useRef<HTMLTextAreaElement>(null);
   const sessionId = useRef<string | null>(null);
   const debugLog = useRef<string[]>([]);
   const abortRef = useRef<AbortController | null>(null);
 
-  const [modelVersion, setModelVersion] = useState<string | null>(null);
+  const [modelBackends, setModelBackends] = useState<ModelBackendOption[]>([]);
+  const [selectedBackendId, setSelectedBackendId] = useState("uk_compiled");
   const [balance, setBalance] = useState<BalanceSummary | null>(null);
   const [topUpLoading, setTopUpLoading] = useState(false);
+  const [planMode, setPlanMode] = useState(false);
   const hasMessages = messages.length > 0;
   const animatedPlaceholder = useAnimatedPlaceholder(EXAMPLE_QUERIES, !hasMessages && !input);
+  const selectedBackend = modelBackends.find((backend) => backend.id === selectedBackendId);
+  const selectedBackendVersion = formatBackendVersion(selectedBackend);
 
   const fetchBalance = useCallback(async () => {
     if (!user) return;
@@ -193,8 +219,16 @@ export default function ChatPage() {
   };
 
   useEffect(() => {
-    apiRequest<{ policyengine_uk_compiled: string }>("GET", "version")
-      .then((v) => setModelVersion(v.policyengine_uk_compiled))
+    apiRequest<ModelBackendsResponse>("GET", "chat/backends")
+      .then((data) => {
+        const options = Object.values(data.backends);
+        const stored = window.localStorage.getItem("policyengine-chat-backend");
+        const nextBackend = stored && options.some((backend) => backend.id === stored)
+          ? stored
+          : data.default;
+        setModelBackends(options);
+        setSelectedBackendId(nextBackend);
+      })
       .catch(() => {});
     // Refresh balance after Stripe redirect
     if (typeof window !== "undefined" && new URLSearchParams(window.location.search).get("topup") === "success") {
@@ -302,7 +336,9 @@ export default function ChatPage() {
         return [{ id: saved.id, session_id: sid, title, created_at: saved.created_at, updated_at: saved.updated_at }, ...filtered];
       });
       return saved;
-    } catch (e) { console.error("Failed to save conversation", e); }
+    } catch {
+      return null;
+    }
     return null;
   }, [user]);
 
@@ -419,7 +455,13 @@ export default function ChatPage() {
       const response = await fetch(getBackendEndpoint("chat/message"), {
         method: "POST",
         headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ messages: apiMessages, session_id: sessionId.current, user_id: user?.id || null, plan_mode: planMode }),
+        body: JSON.stringify({
+          messages: apiMessages,
+          session_id: sessionId.current,
+          user_id: user?.id || null,
+          model_backend: selectedBackendId,
+          plan_mode: planMode,
+        }),
         signal: controller.signal,
       });
       if (response.status === 402) {
@@ -545,6 +587,11 @@ export default function ChatPage() {
 
   const stopStreaming = () => { abortRef.current?.abort(); };
 
+  const handleBackendChange = (backendId: string) => {
+    setSelectedBackendId(backendId);
+    window.localStorage.setItem("policyengine-chat-backend", backendId);
+  };
+
   const handleKeyDown = (e: React.KeyboardEvent) => {
     if (e.key === "Enter" && !e.shiftKey) { e.preventDefault(); sendMessage(); }
   };
@@ -979,36 +1026,72 @@ export default function ChatPage() {
                 />
               </div>
             </div>
-            <div style={{ marginTop: "14px", display: "flex", justifyContent: "space-between", alignItems: "center", gap: "12px", flexWrap: "wrap" }}>
-              <button
-                type="button"
-                onClick={() => setPlanMode((v) => !v)}
-                disabled={isStreaming}
-                title={planMode
-                  ? "Plan mode on — the next message will get clarifying questions before the agent runs anything."
-                  : "Plan mode off — turn on to have the agent ask 1–3 clarifying questions before answering."}
-                style={{
-                  display: "flex", alignItems: "center", gap: "6px",
-                  padding: "4px 10px",
-                  background: planMode ? THEME.primary : "transparent",
-                  color: planMode ? "#fff" : "#6b7280",
-                  border: `1px solid ${planMode ? THEME.primary : "#e5e7eb"}`,
-                  fontSize: "11px",
-                  fontFamily: "inherit",
-                  cursor: isStreaming ? "not-allowed" : "pointer",
-                  fontWeight: 500,
-                  opacity: isStreaming ? 0.5 : 1,
-                  transition: "background 120ms, color 120ms, border-color 120ms",
-                }}
-              >
-                <IconBulb size={12} /> Plan mode {planMode ? "on" : "off"}
-              </button>
-              {!hasMessages && (
-                <div style={{ display: "flex", gap: "16px", alignItems: "center", color: "#b5b1a9", fontSize: "12px" }}>
-                  <span>Press Enter to send · Shift+Enter for new line</span>
-                  {modelVersion && <span style={{ fontSize: "11px", color: "#d1cdc4" }}>policyengine-uk v{modelVersion}</span>}
-                </div>
-              )}
+            <div style={{ marginTop: "14px", color: "#b5b1a9", fontSize: "12px", display: "flex", justifyContent: "space-between", alignItems: "center", gap: "12px", flexWrap: "wrap" }}>
+              <div style={{ display: "flex", alignItems: "center", gap: "12px", flexWrap: "wrap" }}>
+                <button
+                  type="button"
+                  onClick={() => setPlanMode((value) => !value)}
+                  disabled={isStreaming}
+                  title={planMode
+                    ? "Plan mode on - the next message will get clarifying questions before the agent runs anything."
+                    : "Plan mode off - turn on to have the agent ask 1-3 clarifying questions before answering."}
+                  style={{
+                    display: "inline-flex",
+                    alignItems: "center",
+                    gap: "6px",
+                    padding: "4px 10px",
+                    background: planMode ? THEME.primary : "transparent",
+                    color: planMode ? "#fff" : "#6b7280",
+                    border: `1px solid ${planMode ? THEME.primary : "#e5e7eb"}`,
+                    fontSize: "11px",
+                    fontFamily: "inherit",
+                    cursor: isStreaming ? "not-allowed" : "pointer",
+                    fontWeight: 500,
+                    opacity: isStreaming ? 0.5 : 1,
+                    transition: "background 120ms, color 120ms, border-color 120ms",
+                  }}
+                >
+                  <IconBulb size={12} /> Plan mode {planMode ? "on" : "off"}
+                </button>
+                {!hasMessages && <span>Press Enter to send · Shift+Enter for new line</span>}
+              </div>
+              <div style={{ display: "flex", flexDirection: "column", alignItems: "flex-end", gap: "6px" }}>
+                {modelBackends.length > 1 && (
+                  <div style={{ display: "inline-flex", alignItems: "center", gap: "4px", color: "#b5b1a9", fontSize: "11px" }}>
+                    <span style={{ color: "#d1cdc4" }}>Engine</span>
+                    <div style={{ display: "inline-flex", alignItems: "center", padding: "2px", border: `1px solid ${THEME.border}`, background: "#fbfaf8" }}>
+                      {modelBackends.map((backend) => {
+                        const selected = backend.id === selectedBackendId;
+                        return (
+                          <button
+                            key={backend.id}
+                            type="button"
+                            onClick={() => handleBackendChange(backend.id)}
+                            disabled={isStreaming || selected}
+                            title={backend.display_name}
+                            style={{
+                              border: "none",
+                              background: selected ? "#fff" : "transparent",
+                              color: selected ? THEME.primary : "#9e9a90",
+                              cursor: isStreaming || selected ? "default" : "pointer",
+                              fontFamily: "inherit",
+                              fontSize: "11px",
+                              fontWeight: selected ? 500 : 400,
+                              lineHeight: 1,
+                              padding: "5px 7px",
+                              boxShadow: selected ? "0 1px 2px rgba(28, 26, 23, 0.08)" : "none",
+                              opacity: isStreaming && !selected ? 0.5 : 1,
+                            }}
+                          >
+                            {formatBackendLabel(backend)}
+                          </button>
+                        );
+                      })}
+                    </div>
+                  </div>
+                )}
+                {selectedBackendVersion && <span style={{ fontSize: "11px", color: "#d1cdc4", lineHeight: 1 }}>{selectedBackendVersion}</span>}
+              </div>
             </div>
           </div>
         </div>
diff --git a/frontend/src/app/layout.tsx b/frontend/src/app/layout.tsx
index 6d41929..a3ed6a1 100644
--- a/frontend/src/app/layout.tsx
+++ b/frontend/src/app/layout.tsx
@@ -1,7 +1,10 @@
 import type { Metadata } from "next";
+import Script from "next/script";
 import "@mantine/core/styles.css";
 import Providers from "./Providers";
 
+const GA_MEASUREMENT_ID = "G-2YHG89FY0N";
+
 export const metadata: Metadata = {
   title: "PolicyEngine UK",
   description: "UK tax and benefit microsimulation assistant",
@@ -12,6 +15,19 @@ export default function RootLayout({ children }: { children: React.ReactNode })
   return (
     <html lang="en">
       <head>
+        <Script
+          src={`https://www.googletagmanager.com/gtag/js?id=${GA_MEASUREMENT_ID}`}
+          strategy="afterInteractive"
+        />
+        <Script id="google-analytics" strategy="afterInteractive">
+          {`
+            window.dataLayer = window.dataLayer || [];
+            function gtag(){dataLayer.push(arguments);}
+            window.gtag = gtag;
+            gtag('js', new Date());
+            gtag('config', '${GA_MEASUREMENT_ID}');
+          `}
+        </Script>
         <link rel="preconnect" href="https://fonts.googleapis.com" />
         <link rel="preconnect" href="https://fonts.gstatic.com" crossOrigin="anonymous" />
         <link href="https://fonts.googleapis.com/css2?family=Source+Serif+4:ital,opsz,wght@0,8..60,300;0,8..60,400;0,8..60,600;1,8..60,400&family=Newsreader:ital,opsz,wght@0,6..72,300;0,6..72,400;0,6..72,500;1,6..72,400&display=swap" rel="stylesheet" />
diff --git a/modal_app.py b/modal_app.py
index 5b91306..884afef 100644
--- a/modal_app.py
+++ b/modal_app.py
@@ -25,21 +25,7 @@ def _preload_engine():
 image = (
     modal.Image.debian_slim(python_version="3.13")
     .apt_install("libpq-dev", "gcc")
-    .pip_install(
-        "fastapi",
-        "uvicorn[standard]",
-        "sqlmodel",
-        "psycopg2-binary",
-        "anthropic",
-        "pydantic-ai[anthropic]",
-        "policyengine-uk-compiled>=0.20.0",
-        "policyengine_uk>=2.75.0",
-        "pandas",
-        "httpx",
-        "supabase",
-        "stripe",
-        "python-dateutil",
-    )
+    .pip_install_from_requirements("backend/requirements.txt")
     .run_function(_preload_engine)
     .add_local_dir("backend", remote_path="/app/backend", copy=True)
     # Regenerate reference.md against the Modal-installed