diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..01b3fa8
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,10 @@
+# Copilot Instructions
+
+Follow the repository's canonical engineering guidance under
+`docs/engineering/skills/`.
+
+For tests, read `docs/engineering/skills/testing.md` before adding, moving, or
+reviewing test files.
+
+For changes to the chat model pathway, prompts, tools, or calculation
+boundaries, read `docs/engineering/skills/uk-chat-runtime.md`.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..4cfc466
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,62 @@
+name: Tests
+
+on:
+  pull_request:
+    branches:
+      - main
+
+# Least privilege: both jobs only check out and read the repository.
+permissions:
+  contents: read
+
+# A newer push to the same pull request cancels the run it supersedes.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  backend:
+    name: Backend pytest
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+
+      - name: Install backend dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r backend/requirements.txt pytest
+
+      - name: Run backend tests
+        run: make test-backend
+
+  frontend:
+    name: Frontend build
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Set up Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: "20"
+          cache: npm
+          cache-dependency-path: frontend/package-lock.json
+
+      - name: Install frontend dependencies
+        working-directory: frontend
+        run: npm ci
+
+      - name: Build frontend
+        working-directory: frontend
+        env:
+          NEXT_TELEMETRY_DISABLED: "1"
+        run: npm run build
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..1723003
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,17 @@
+# Agent Instructions
+
+These instructions apply repository-wide.
+
+Canonical AI-facing engineering guidance lives under `docs/engineering/skills/`.
+Use those files as the source of truth across Codex, Claude, Copilot, and other
+AI tools.
+
+When adding, moving, or reviewing tests, read
+`docs/engineering/skills/testing.md`.
+
+When changing the chat model pathway, system prompts, tool definitions, or
+calculation boundaries, read `docs/engineering/skills/uk-chat-runtime.md`.
+
+Keep this file thin. Do not duplicate durable engineering guidance here; update
+the canonical docs first, then adjust this adapter only when an entry point
+needs to point at new guidance.
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..071079e
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,13 @@
+# Claude Instructions
+
+These instructions apply repository-wide.
+
+Repository-wide AI-facing engineering guidance lives under
+`docs/engineering/skills/`. This file is a Claude adapter and should stay thin;
+do not duplicate detailed testing, runtime, prompt, or architecture rules here.
+
+Before adding, moving, or reviewing tests, read
+`docs/engineering/skills/testing.md`.
+
+Before changing the UK chat model pathway, system prompts, tool definitions, or
+calculation boundaries, read `docs/engineering/skills/uk-chat-runtime.md`.
diff --git a/Makefile b/Makefile
index 16c12f3..41592e2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: up down build logs restart shell-backend shell-frontend
+.PHONY: up down build logs restart shell-backend shell-frontend test test-backend test-frontend
 
 # Start all services in dev mode (live reload)
 up:
@@ -42,3 +42,12 @@ shell-frontend:
 # One-time setup: copy .env.example to .env
 init:
 	@if [ ! -f .env ]; then cp .env.example .env && echo "Created .env — fill in your ANTHROPIC_API_KEY"; else echo ".env already exists"; fi
+
+# Run the same checks used by PR CI, assuming dependencies are already installed.
+test: test-backend test-frontend
+
+test-backend:
+	PYTHONPATH=backend python -m pytest backend/tests
+
+test-frontend:
+	cd frontend && npm run build
diff --git a/backend/agent_tools.py b/backend/agent_tools.py
index d521102..e0c6392 100644
--- a/backend/agent_tools.py
+++ b/backend/agent_tools.py
@@ -710,7 +710,7 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
 TOOL_DEFINITIONS = [
     {
         "name": "run_python",
-        "description": "Execute reproducible Python code using the official PolicyEngine UK compiled interface. The environment preloads `policyengine_uk_compiled` as `pe`, plus `Simulation`, `Parameters`, `StructuralReform`, `aggregate_microdata`, `combine_microdata`, `capabilities`, `ensure_dataset`, `pd`, `np`, `json`, and `math`. Assign the final answer to `result` and use `print()` for intermediate output.",
+        "description": "Execute reproducible Python code using the official PolicyEngine UK compiled interface. The environment preloads `policyengine_uk_compiled` as `pe`, plus `Simulation`, `Parameters`, `StructuralReform`, `aggregate_microdata`, `combine_microdata`, `capabilities`, `ensure_dataset`, `pd`, `np`, `json`, and `math`. Assign the final answer to `result` and use `print()` for intermediate output. Do not inspect or return row-level survey microdata. For household examples, create illustrative synthetic households, prefer `Simulation.single_person()` for single-person examples, and label them as illustrative rather than real households.",
         "input_schema": {
             "type": "object",
             "properties": {
@@ -725,6 +725,8 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
             "Generate a chart JSON block for the frontend to render. "
             "Use this for visualisations such as income distributions, marginal-rate or tax-schedule curves, "
             "decile impact comparisons, and trends over time or income. "
+            "Use factually neutral titles, subtitles, labels, and captions; do not call policies good, bad, fair, unfair, "
+            "regressive, progressive, generous, or punitive. "
             "The tool returns a `chart_markdown` field containing a ```chart fenced JSON block — you MUST paste that "
             "string verbatim into your next text response, otherwise the chart will not appear to the user. "
             "Do not attempt to render charts with matplotlib inside `run_python`; the UI cannot display matplotlib output. "
@@ -738,7 +740,7 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
                     "enum": ["line", "bar", "area", "scatter"],
                     "description": "Chart type. Use `line` for schedules/curves over a continuous x, `bar` for category comparisons (e.g. deciles), `area` for stacked compositions, `scatter` for point clouds.",
                 },
-                "title": {"type": "string", "description": "Chart title shown above the plot."},
+                "title": {"type": "string", "description": "Factually neutral chart title shown above the plot."},
                 "data": {
                     "type": "array",
                     "description": "List of row objects. Each row must contain the `x_field` key and every key listed in `y_fields`.",
diff --git a/backend/prompts.py b/backend/prompts.py
new file mode 100644
index 0000000..0833af0
--- /dev/null
+++ b/backend/prompts.py
@@ -0,0 +1,197 @@
+"""Prompt text used by the UK chat backend.
+
+Keep these constants declarative: routes should assemble blocks and call
+models, while this module owns model-facing instructions.
+
+For model-neutral engineering guidance around this runtime pathway, see
+`docs/engineering/skills/uk-chat-runtime.md`.
+"""
+
+ROLE_AND_TASK = """
+You are an expert policy analysis assistant for a UK microsimulation platform.
+You help users understand and analyse UK tax and benefit policy using
+reproducible Python code.
+"""
+
+PYTHON_COMPUTATION_RULES = """
+CRITICAL - ALWAYS COMPUTE WITH PYTHON:
+- Never answer quantitative policy questions from memory.
+- You have one execution tool: `run_python`.
+- Use `run_python` for every tax, benefit, reform, schedule, poverty, decile,
+  and distributional question.
+- Every number in your answer must come directly from the Python result you
+  just computed.
+"""
+
+MODEL_INSTRUCTIONS_RULES = """
+CRITICAL - START BY READING THE MODEL INSTRUCTIONS:
+- At the start of a new line of analysis, use Python to inspect
+  `capabilities()`.
+- Use that to ground yourself in the available datasets, years, programmes,
+  and caveats before you simulate.
+- If the user asks about something outside the modelled scope, say so clearly
+  instead of guessing.
+"""
+
+OFFICIAL_INTERFACE_RULES = """
+CRITICAL - USE THE OFFICIAL POLICYENGINE PYTHON INTERFACE:
+- The Python environment preloads:
+  `policyengine_uk_compiled` as `pe`
+  `Simulation`
+  `Parameters`
+  `StructuralReform`
+  `aggregate_microdata`
+  `combine_microdata`
+  `capabilities`
+  `ensure_dataset`
+  `pd`, `np`, `json`, `math`
+- Prefer writing code directly against those objects so the run is
+  reproducible outside chat.
+- Do not recreate policy logic manually if the package already provides it.
+"""
+
+REPRODUCIBILITY_RULES = """
+REPRODUCIBILITY RULES:
+- Write clear Python that another developer could copy and run.
+- Prefer one substantial `run_python` call over many tiny ones.
+- Put the important output into `result`.
+- Use `print()` only for short diagnostics.
+- Do not rely on hidden reasoning for calculations when code can do the work.
+"""
+
+MICRODATA_PRIVACY_RULES = """
+MICRODATA PRIVACY AND ILLUSTRATIVE HOUSEHOLDS:
+- Do not access, display, quote, or imply access to row-level survey microdata
+  or real households.
+- Use aggregate microdata interfaces only for aggregate outputs; do not inspect
+  or return individual survey rows as examples.
+- If the user asks how individual households are constructed in the data, what
+  households in the data look like, or for examples of actual household records,
+  explain that this app cannot access or disclose real households.
+- For household examples, construct illustrative synthetic households with the
+  public `Simulation` API. Prefer `Simulation.single_person()` when a
+  single-person example fits the question.
+- Always label these households as illustrative, synthetic, or hypothetical,
+  not actual households from the data.
+"""
+
+API_AND_DATASET_RULES = """
+API AND DATASETS:
+- A live API reference (docstrings, `capabilities()` snapshot, full
+  `Parameters` JSON schema) is attached to this system prompt - consult it for
+  signatures, reform keys, and dataset descriptions rather than guessing.
+- Call `capabilities()` at the start of a new line of analysis to check what's
+  modelled and locally available before committing to an approach.
+- Tell the user which dataset you used when it matters.
+- If something is not modelled well enough for a quantitative answer, say so
+  clearly and do not fabricate estimates.
+"""
+
+ANALYTICAL_NOTES = """
+ANALYTICAL NOTES:
+- Decile impacts are decile-level averages, not economy-wide means.
+- Poverty outputs are already percentage rates, not decimal shares.
+- If a result is counterintuitive, explain the mechanism briefly.
+- Use British English.
+"""
+
+NEUTRALITY_RULES = """
+FACTUAL NEUTRALITY:
+- Be factually neutral.
+- Do not describe UK tax or benefit choices as good, bad, fair, unfair,
+  regressive, progressive, generous, punitive, or similar.
+- Stick to mechanics and quantified effects.
+- Describe who pays or receives more or less, by how much, over what period,
+  and under which dataset, year, and assumptions.
+- If a distributional pattern matters, describe the measured direction
+  directly rather than applying value labels.
+- Do not make policy recommendations unless the user explicitly asks for policy
+  design options. Even then, frame tradeoffs neutrally.
+"""
+
+USER_FACING_STYLE = """
+USER-FACING STYLE:
+- Prefer plain English in the prose answer.
+- Avoid exposing internal parameter keys unless the user wants code-level
+  detail.
+- Keep the answer grounded in what the Python run actually showed.
+- Do not paste the full Python into the main answer unless the user asks; the
+  UI will show the executed code separately.
+"""
+
+CHART_RULES = """
+CHARTS:
+- When a visualisation would help (distributions, marginal-rate or tax-schedule
+  curves, decile comparisons, trends), call the `generate_chart` tool after you
+  have the data from `run_python`.
+- The tool returns a `chart_markdown` field containing a ```chart fenced JSON
+  block. Paste that block VERBATIM into your next text response - the frontend
+  parses it to render the chart. If you do not include it, no chart will
+  appear.
+- Use factually neutral chart titles, subtitles, labels, and captions.
+- Do not try to draw charts with matplotlib inside `run_python`; matplotlib
+  output is discarded by the UI.
+- Use the `*_format` arguments (e.g. `y_format="currency"`,
+  `x_format="percent"`) so axis ticks and tooltips are formatted correctly.
+"""
+
+SYSTEM_PROMPT_SECTIONS = (
+    ROLE_AND_TASK,
+    PYTHON_COMPUTATION_RULES,
+    MODEL_INSTRUCTIONS_RULES,
+    OFFICIAL_INTERFACE_RULES,
+    REPRODUCIBILITY_RULES,
+    MICRODATA_PRIVACY_RULES,
+    API_AND_DATASET_RULES,
+    ANALYTICAL_NOTES,
+    NEUTRALITY_RULES,
+    USER_FACING_STYLE,
+    CHART_RULES,
+)
+
+SYSTEM_PROMPT = "\n\n".join(section.strip() for section in SYSTEM_PROMPT_SECTIONS)
+
+PLAN_MODE_DIRECTIVE = """
+PLAN MODE IS ACTIVE FOR THIS TURN:
+- Do NOT call any tools.
+- Identify 1-3 specific ambiguities in the user's question (e.g. which year,
+  dataset, reform parameters, metric, comparison baseline, population subset).
+- Ask those 1-3 questions concisely as a numbered list. No preamble beyond one
+  short lead-in sentence.
+- If the question is fully unambiguous, confirm your understanding in one
+  sentence and offer to proceed - still do not call tools.
+- You will continue without plan mode on the next turn once the user replies.
+""".strip()
+
+CHARTS_MODE_DIRECTIVE = """
+The user has enabled chart mode. When the question's answer would benefit from a
+visualization (distributions, comparisons across categories, trends over time,
+marginal-rate curves, decile/percentile breakdowns), prefer to include a chart
+using the available chart tools alongside your written explanation. Do not force
+charts on questions that are not chartable (e.g. definitional, yes/no, or
+single-number lookups) - this is a preference, not a requirement.
+""".strip()
+
+SUGGESTION_SYSTEM = (
+    "You suggest follow-up questions for a UK tax and benefit policy chatbot. "
+    "Given the latest user question and the assistant's answer, propose 2-3 short, "
+    "specific follow-ups the user is likely to want next (a comparison, a slice by "
+    "region or decile, a different reform, a chart request, an alternative dataset, "
+    "etc.). Each question must be under 80 characters, phrased as the user would "
+    "type it, in British English, with no numbering or trailing punctuation beyond "
+    "a question mark. Use neutral, descriptive wording; do not call policies good, "
+    "bad, fair, unfair, regressive, progressive, generous, or punitive. Respond "
+    "ONLY with a JSON object of the form "
+    '{"suggestions": ["...", "..."]} - no prose, no code fences.'
+)
+
+TITLE_SYSTEM = (
+    "You are titling conversations from a UK tax and benefit policy assistant. "
+    "Generate a very short title (4-6 words) that accurately describes the policy "
+    "question being asked. Use UK policy terminology (e.g. 'marginal tax rate' not "
+    "'MTR', 'National Insurance' not 'NI', 'Income Support' not 'IS'). Use neutral, "
+    "descriptive wording; do not call policies good, bad, fair, unfair, regressive, "
+    "progressive, generous, or punitive. Use sentence case (capitalise only the "
+    "first word and proper nouns). Output only the title with no punctuation, "
+    "quotes, or explanation."
+)
diff --git a/backend/routes/chatbot.py b/backend/routes/chatbot.py
index 7bd96e2..47ab22b 100644
--- a/backend/routes/chatbot.py
+++ b/backend/routes/chatbot.py
@@ -18,74 +18,19 @@
 from pydantic_ai.settings import ModelSettings
 
 from agent_tools import execute_tool, TOOL_DEFINITIONS
+from prompts import (
+    CHARTS_MODE_DIRECTIVE,
+    PLAN_MODE_DIRECTIVE,
+    SUGGESTION_SYSTEM,
+    SYSTEM_PROMPT,
+    TITLE_SYSTEM,
+)
 from rate_limit import limiter, chat_key_func, CHAT_USER_LIMIT, CHAT_IP_LIMIT
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/chat", tags=["chatbot"])
 
-# ---------------------------------------------------------------------------
-# System prompt
-# ---------------------------------------------------------------------------
-SYSTEM_PROMPT = """You are an expert policy analysis assistant for a UK microsimulation platform. You help users understand and analyse UK tax and benefit policy using reproducible Python code.
-
-CRITICAL - ALWAYS COMPUTE WITH PYTHON:
-- Never answer quantitative policy questions from memory.
-- You have one execution tool: `run_python`.
-- Use `run_python` for every tax, benefit, reform, schedule, poverty, decile, and distributional question.
-- Every number in your answer must come directly from the Python result you just computed.
-
-CRITICAL - START BY READING THE MODEL INSTRUCTIONS:
-- At the start of a new line of analysis, use Python to inspect `capabilities()`.
-- Use that to ground yourself in the available datasets, years, programmes, and caveats before you simulate.
-- If the user asks about something outside the modelled scope, say so clearly instead of guessing.
-
-CRITICAL - USE THE OFFICIAL POLICYENGINE PYTHON INTERFACE:
-- The Python environment preloads:
-  `policyengine_uk_compiled` as `pe`
-  `Simulation`
-  `Parameters`
-  `StructuralReform`
-  `aggregate_microdata`
-  `combine_microdata`
-  `capabilities`
-  `ensure_dataset`
-  `pd`, `np`, `json`, `math`
-- Prefer writing code directly against those objects so the run is reproducible outside chat.
-- Do not recreate policy logic manually if the package already provides it.
-
-REPRODUCIBILITY RULES:
-- Write clear Python that another developer could copy and run.
-- Prefer one substantial `run_python` call over many tiny ones.
-- Put the important output into `result`.
-- Use `print()` only for short diagnostics.
-- Do not rely on hidden reasoning for calculations when code can do the work.
-
-API AND DATASETS:
-- A live API reference (docstrings, `capabilities()` snapshot, full `Parameters` JSON schema) is attached to this system prompt — consult it for signatures, reform keys, and dataset descriptions rather than guessing.
-- Call `capabilities()` at the start of a new line of analysis to check what's modelled and locally available before committing to an approach.
-- Tell the user which dataset you used when it matters.
-- If something is not modelled well enough for a quantitative answer, say so clearly and do not fabricate estimates.
-
-ANALYTICAL NOTES:
-- Decile impacts are decile-level averages, not economy-wide means.
-- Poverty outputs are already percentage rates, not decimal shares.
-- If a result is counterintuitive, explain the mechanism briefly.
-- Stay analytically neutral and use British English.
-
-USER-FACING STYLE:
-- Prefer plain English in the prose answer.
-- Avoid exposing internal parameter keys unless the user wants code-level detail.
-- Keep the answer grounded in what the Python run actually showed.
-- Do not paste the full Python into the main answer unless the user asks; the UI will show the executed code separately.
-
-CHARTS:
-- When a visualisation would help (distributions, marginal-rate or tax-schedule curves, decile comparisons, trends), call the `generate_chart` tool after you have the data from `run_python`.
-- The tool returns a `chart_markdown` field containing a ```chart fenced JSON block. Paste that block VERBATIM into your next text response — the frontend parses it to render the chart. If you do not include it, no chart will appear.
-- Do not try to draw charts with matplotlib inside `run_python`; matplotlib output is discarded by the UI.
-- Use the `*_format` arguments (e.g. `y_format="currency"`, `x_format="percent"`) so axis ticks and tooltips are formatted correctly.
-"""
-
 
 # ---------------------------------------------------------------------------
 # Pydantic-AI agent setup
@@ -220,21 +165,6 @@ class ChatRequest(BaseModel):
     image_media_type: str | None = None
 
 
-PLAN_MODE_DIRECTIVE = """
-PLAN MODE IS ACTIVE FOR THIS TURN:
-- Do NOT call any tools.
-- Identify 1–3 specific ambiguities in the user's question (e.g. which year, dataset, reform parameters, metric, comparison baseline, population subset).
-- Ask those 1–3 questions concisely as a numbered list. No preamble beyond one short lead-in sentence.
-- If the question is fully unambiguous, confirm your understanding in one sentence and offer to proceed — still do not call tools.
-- You will continue without plan mode on the next turn once the user replies.
-""".strip()
-
-
-CHARTS_MODE_DIRECTIVE = """
-The user has enabled chart mode. When the question's answer would benefit from a visualization (distributions, comparisons across categories, trends over time, marginal-rate curves, decile/percentile breakdowns), prefer to include a chart using the available chart tools alongside your written explanation. Do not force charts on questions that are not chartable (e.g. definitional, yes/no, or single-number lookups) — this is a preference, not a requirement.
-""".strip()
-
-
 class TitleRequest(BaseModel):
     first_user_message: str
     first_assistant_message: str | None = None
@@ -244,18 +174,6 @@ class TitleRequest(BaseModel):
 # Follow-up suggestion chips
 # ---------------------------------------------------------------------------
 
-_SUGGESTION_SYSTEM = (
-    "You suggest follow-up questions for a UK tax and benefit policy chatbot. "
-    "Given the latest user question and the assistant's answer, propose 2–3 short, "
-    "specific follow-ups the user is likely to want next (a comparison, a slice by "
-    "region or decile, a different reform, a chart request, an alternative dataset, "
-    "etc.). Each question must be under 80 characters, phrased as the user would "
-    "type it, in British English, with no numbering or trailing punctuation beyond "
-    "a question mark. Respond ONLY with a JSON object of the form "
-    '{"suggestions": ["...", "..."]} — no prose, no code fences.'
-)
-
-
 async def _generate_followup_suggestions(
     last_user_message: str,
     assistant_answer: str,
@@ -283,7 +201,7 @@ async def _generate_followup_suggestions(
             client.messages.create(
                 model=SUGGESTION_MODEL,
                 max_tokens=200,
-                system=_SUGGESTION_SYSTEM,
+                system=SUGGESTION_SYSTEM,
                 messages=[{"role": "user", "content": user_block}],
             ),
             timeout=SUGGESTION_TIMEOUT_SECS,
@@ -339,13 +257,7 @@ def generate_title(request: TitleRequest):
     response = client.messages.create(
         model=TITLE_MODEL,
         max_tokens=32,
-        system=(
-            "You are titling conversations from a UK tax and benefit policy assistant. "
-            "Generate a very short title (4–6 words) that accurately describes the policy question being asked. "
-            "Use UK policy terminology (e.g. 'marginal tax rate' not 'MTR', 'National Insurance' not 'NI', 'Income Support' not 'IS'). "
-            "Use sentence case (capitalise only the first word and proper nouns). "
-            "Output only the title with no punctuation, quotes, or explanation."
-        ),
+        system=TITLE_SYSTEM,
         messages=[{"role": "user", "content": content}],
     )
     return {"title": response.content[0].text.strip()}
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
index 2f37260..1c4fc3f 100644
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -8,6 +8,9 @@
 """
 
 import os
+from pathlib import Path
+
+import pytest
 
 # Bump rate limits well above test workload. Real production limits are
 # 5/min and 60/hour for chat — these are intentionally absurd so the
@@ -15,3 +18,30 @@
 os.environ.setdefault("RATE_LIMIT_CHAT_PER_MIN", "10000")
 os.environ.setdefault("RATE_LIMIT_CHAT_PER_HOUR", "100000")
 os.environ.setdefault("RATE_LIMIT_CHAT_IP_PER_MIN", "10000")
+
+if not os.environ.get("DATABASE_URL"):
+    # No database configured: export a throwaway SQLite URL as DATABASE_URL.
+    # PYTEST_SQLITE_DB overrides where the SQLite file lives.
+    test_db = Path(os.environ.get("PYTEST_SQLITE_DB", "/tmp/policyengine_uk_chat_tests.sqlite"))
+    # Delete any file left by an earlier run; a missing file is not an error.
+    test_db.unlink(missing_ok=True)
+    os.environ["DATABASE_URL"] = f"sqlite:///{test_db}"
+
+
+@pytest.fixture
+def isolated_conversations_table(tmp_path, monkeypatch):
+    """Swap `routes.conversations._engine` for a per-test SQLite engine (no Postgres needed)."""
+    from sqlmodel import SQLModel, create_engine
+    import routes.conversations as conversations
+
+    # check_same_thread=False: sqlite3 otherwise rejects use from a thread other than the creating one.
+    sqlite_url = f"sqlite:///{tmp_path / 'conversations.sqlite'}"
+    scratch_engine = create_engine(sqlite_url, connect_args={"check_same_thread": False})
+    SQLModel.metadata.create_all(scratch_engine)
+    monkeypatch.setattr(conversations, "_engine", scratch_engine)
+
+    yield
+
+    # Teardown: drop the tables created above, then dispose of the engine.
+    SQLModel.metadata.drop_all(scratch_engine)
+    scratch_engine.dispose()
diff --git a/backend/tests/test_agent_tools.py b/backend/tests/test_agent_tools.py
index 0740e52..191d691 100644
--- a/backend/tests/test_agent_tools.py
+++ b/backend/tests/test_agent_tools.py
@@ -4,11 +4,14 @@
 Run inside the backend container: pytest tests/
 """
 
+import importlib.util
+import os
+
 import pytest
+
 from agent_tools import (
     get_baseline_parameters,
     calculate_household,
-    compute,
     generate_chart,
     execute_tool,
     _build_compiled_policy,
@@ -16,11 +19,31 @@
     run_python,
 )
 
+COMPILED_AVAILABLE = importlib.util.find_spec("policyengine_uk_compiled") is not None
+requires_compiled = pytest.mark.skipif(
+    os.environ.get("CI") != "true" and not COMPILED_AVAILABLE,  # CI=true never skips, even if the package is absent
+    reason="policyengine_uk_compiled is not installed",
+)
+
+
+# ---------------------------------------------------------------------------
+# policyengine_uk_compiled interface
+# ---------------------------------------------------------------------------
+
+@requires_compiled
+class TestCompiledInterface:
+    def test_simulation_single_person_available(self):
+        from policyengine_uk_compiled import Simulation
+
+        assert hasattr(Simulation, "single_person")
+        assert callable(Simulation.single_person)
+
 
 # ---------------------------------------------------------------------------
 # get_baseline_parameters
 # ---------------------------------------------------------------------------
 
+@requires_compiled
 class TestGetBaselineParameters:
     def test_returns_parameters(self):
         result = get_baseline_parameters(year=2023)
@@ -48,6 +71,7 @@ def test_invalid_year_returns_error(self):
     year=2023,
 )
 
+@requires_compiled
 class TestCalculateHousehold:
     def test_basic_household(self):
         result = calculate_household(**SINGLE_ADULT)
@@ -180,56 +204,6 @@ def test_multi_person_household_with_children(self):
         assert adult["baseline_income_tax"] > 0
 
 
-# ---------------------------------------------------------------------------
-# compute
-# ---------------------------------------------------------------------------
-
-class TestCompute:
-    def test_diff(self):
-        result = compute("diff", [1, 3, 6, 10])
-        assert result["result"] == [2, 3, 4]
-
-    def test_pct_change(self):
-        result = compute("pct_change", [100, 110])
-        assert abs(result["result"][0] - 10.0) < 0.001
-
-    def test_mean(self):
-        result = compute("mean", [1, 2, 3, 4, 5])
-        assert result["result"] == 3.0
-
-    def test_sum(self):
-        result = compute("sum", [10, 20, 30])
-        assert result["result"] == 60
-
-    def test_marginal_rate(self):
-        # net incomes at £10k steps, gross incomes
-        net = [8000, 14800, 21600]
-        gross = [10000, 20000, 30000]
-        result = compute("marginal_rate", net, gross)
-        # (14800-8000)/(20000-10000)*100 = 68%
-        assert abs(result["result"][0] - 68.0) < 0.01
-
-    def test_subtract(self):
-        result = compute("subtract", [10, 20, 30], [1, 2, 3])
-        assert result["result"] == [9, 18, 27]
-
-    def test_divide_by_zero(self):
-        result = compute("divide", [10, 20], [0, 4])
-        assert result["result"][0] == 0  # safe division
-
-    def test_empty_data(self):
-        result = compute("sum", [])
-        assert "error" in result
-
-    def test_unknown_operation(self):
-        result = compute("nonexistent", [1, 2, 3])
-        assert "error" in result
-
-    def test_mismatched_lengths(self):
-        result = compute("subtract", [1, 2, 3], [1, 2])
-        assert "error" in result
-
-
 # ---------------------------------------------------------------------------
 # generate_chart
 # ---------------------------------------------------------------------------
@@ -261,13 +235,41 @@ def test_chart_markdown_format(self):
         assert result["chart_markdown"].startswith("```chart\n")
         assert result["chart_markdown"].endswith("\n```")
 
+    def test_with_formats(self):
+        data = [{"income": i * 10000, "tax": i * 2000} for i in range(10)]
+        result = generate_chart("line", "Tax schedule", data, "income", ["tax"], x_format="currency", y_format="currency")
+        assert result["status"] == "success"
+
 
 class DummyModelDump:
     def model_dump(self):
         return {"child_benefit": 123}
 
 
+class TestJsonSafe:
+    def test_serialises_simulation_like_objects(self):
+        serialised = _json_safe({"result": DummyModelDump()})
+        assert serialised == {"result": {"child_benefit": 123}}
+
+
+@requires_compiled
 class TestRunPython:
+    def test_replaces_old_compute_sum_use_case(self):
+        result = run_python("data = [10, 20, 30]\nresult = sum(data)")
+        assert result["result"] == 60
+
+    def test_replaces_old_compute_marginal_rate_use_case(self):
+        code = """
+net = [8000, 14800, 21600]
+gross = [10000, 20000, 30000]
+result = [
+    (net[i + 1] - net[i]) / (gross[i + 1] - gross[i]) * 100
+    for i in range(len(net) - 1)
+]
+"""
+        result = run_python(code)
+        assert abs(result["result"][0] - 68.0) < 0.01
+
     def test_supports_basic_introspection(self):
         result = run_python("value = {'a': 1}\nresult = {'type': type(value).__name__, 'dir_has_keys': 'keys' in dir(value)}")
         assert result["result"] == {"type": "dict", "dir_has_keys": True}
@@ -276,20 +278,12 @@ def test_supports_safe_imports(self):
         result = run_python("import json\nresult = json.loads('{\"ok\": true}')")
         assert result["result"] == {"ok": True}
 
-    def test_serialises_simulation_like_objects(self):
-        serialised = _json_safe({"result": DummyModelDump()})
-        assert serialised == {"result": {"child_benefit": 123}}
-
-    def test_with_formats(self):
-        data = [{"income": i * 10000, "tax": i * 2000} for i in range(10)]
-        result = generate_chart("line", "Tax schedule", data, "income", ["tax"], x_format="currency", y_format="currency")
-        assert result["status"] == "success"
-
 
 # ---------------------------------------------------------------------------
 # _build_compiled_policy
 # ---------------------------------------------------------------------------
 
+@requires_compiled
 class TestBuildCompiledPolicy:
     def test_none_reform_returns_none(self):
         assert _build_compiled_policy(None) is None
@@ -319,10 +313,15 @@ def test_unknown_tool(self):
         result = execute_tool("nonexistent_tool", {})
         assert "error" in result
 
-    def test_dispatches_compute(self):
-        result = execute_tool("compute", {"operation": "sum", "data": [1, 2, 3]})
+    @requires_compiled
+    def test_dispatches_run_python(self):
+        result = execute_tool("run_python", {"code": "result = sum([1, 2, 3])"})  # goes through the dispatcher rather than calling run_python directly
         assert result["result"] == 6
 
+    def test_compute_is_not_exposed(self):
+        result = execute_tool("compute", {"operation": "sum", "data": [1, 2, 3]})  # same arguments the removed test_dispatches_compute used
+        assert result["error"] == "Unknown tool: compute"  # pins the exact error text, not just the presence of an "error" key
+
     def test_dispatches_generate_chart(self):
         result = execute_tool("generate_chart", {
             "chart_type": "line", "title": "T",
diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py
index 8ede926..7863b1d 100644
--- a/backend/tests/test_api.py
+++ b/backend/tests/test_api.py
@@ -5,12 +5,20 @@
 """
 
 import json
+import os
+
 import pytest
 from fastapi.testclient import TestClient
 from main import app
 
 client = TestClient(app)
 
+requires_live_anthropic = pytest.mark.skipif(
+    os.environ.get("RUN_LIVE_ANTHROPIC_TESTS") != "1"  # opt-in flag must be exactly "1"
+    or not os.environ.get("ANTHROPIC_API_KEY"),  # also skip when the key is unset or empty
+    reason="set RUN_LIVE_ANTHROPIC_TESTS=1 and ANTHROPIC_API_KEY to run live Anthropic tests",
+)
+
 
 # ---------------------------------------------------------------------------
 # Health
@@ -27,6 +35,7 @@ def test_health(self):
 # Conversations CRUD
 # ---------------------------------------------------------------------------
 
+@pytest.mark.usefixtures("isolated_conversations_table")
 class TestConversations:
     def _save(self, session_id="test-session-1", title="Test", messages=None, user_id=None):
         return client.post("/conversations", json={
@@ -135,6 +144,7 @@ def test_report_includes_tool_inputs_and_outputs(self):
 # Title generation
 # ---------------------------------------------------------------------------
 
+@requires_live_anthropic
 class TestTitle:
     def test_generates_title(self):
         r = client.post("/chat/title", json={
@@ -171,6 +181,7 @@ def parse_sse(response_text: str) -> list[dict]:
     return events
 
 
+@requires_live_anthropic
 class TestChatMessage:
     def test_simple_chat_returns_sse(self):
         with client.stream("POST", "/chat/message", json={
diff --git a/backend/tests/test_prompts.py b/backend/tests/test_prompts.py
new file mode 100644
index 0000000..745491d
--- /dev/null
+++ b/backend/tests/test_prompts.py
@@ -0,0 +1,76 @@
+"""Regression tests for model-facing prompt contracts."""
+
+import pytest
+
+from agent_tools import TOOL_DEFINITIONS
+from prompts import (
+    SYSTEM_PROMPT,
+    SUGGESTION_SYSTEM,
+    TITLE_SYSTEM,
+)
+
+
+def _tool(name: str) -> dict:
+    return next(tool for tool in TOOL_DEFINITIONS if tool["name"] == name)  # first match; raises StopIteration if no tool has this name
+
+
+def test_main_prompt_contains_factual_neutrality_rules():
+    assert "Be factually neutral." in SYSTEM_PROMPT
+    assert "Stick to mechanics and quantified effects." in SYSTEM_PROMPT
+    for term in (  # every listed term must occur somewhere in the main prompt
+        "good",
+        "bad",
+        "fair",
+        "unfair",
+        "regressive",
+        "progressive",
+        "generous",
+        "punitive",
+    ):
+        assert term in SYSTEM_PROMPT  # plain substring match, so "unfair" alone also satisfies "fair"
+
+
+def test_main_prompt_contains_microdata_privacy_rules():
+    assert "row-level survey microdata" in SYSTEM_PROMPT
+    assert "real households" in SYSTEM_PROMPT  # implied by the longer phrase asserted on the next line
+    assert "cannot access or disclose real households" in SYSTEM_PROMPT
+    assert "illustrative synthetic households" in SYSTEM_PROMPT
+    assert "Simulation.single_person()" in SYSTEM_PROMPT  # literal API name, including the parentheses
+
+
+def test_run_python_tool_repeats_microdata_contract():
+    description = _tool("run_python")["description"]  # description string of the run_python entry in TOOL_DEFINITIONS
+    assert "row-level survey microdata" in description
+    assert "illustrative synthetic households" in description
+    assert "Simulation.single_person()" in description
+    assert "rather than real households" in description
+
+
+def test_generate_chart_tool_requires_neutral_titles():
+    chart_tool = _tool("generate_chart")
+    description = chart_tool["description"]
+    title_description = chart_tool["input_schema"]["properties"]["title"]["description"]  # per-parameter text for the `title` input
+    assert "factually neutral" in description  # case-sensitive match
+    assert "factually neutral" in title_description.lower()  # case-insensitive match, unlike the line above
+
+
+def test_secondary_model_prompts_use_neutral_wording():
+    for prompt in (SUGGESTION_SYSTEM, TITLE_SYSTEM):  # both secondary prompts must pass all three checks
+        assert "neutral, descriptive wording" in prompt
+        assert "regressive" in prompt
+        assert "punitive" in prompt
+
+
+def test_system_blocks_preserve_cache_breakpoints_after_prompt_refactor():
+    pytest.importorskip("pydantic_ai")  # skip, rather than fail, when the package cannot be imported
+    pytest.importorskip("anthropic")  # presumably both are needed to import routes.chatbot; confirm
+
+    from routes.chatbot import _build_system_blocks  # deferred so the skip checks above run first
+
+    on = _build_system_blocks(plan_mode=True, charts_mode=True)
+    off = _build_system_blocks(plan_mode=False, charts_mode=False)
+    assert on[0] == off[0]  # first block is identical whether the mode flags are on or off
+    assert on[0]["text"] == SYSTEM_PROMPT
+    assert on[0]["cache_control"] == {"type": "ephemeral"}
+    assert "PLAN MODE IS ACTIVE" in on[-2]["text"]  # second-to-last block when both modes are on
+    assert "chart mode" in on[-1]["text"]  # last block when both modes are on
diff --git a/docs/engineering/skills/README.md b/docs/engineering/skills/README.md
new file mode 100644
index 0000000..5ca53c6
--- /dev/null
+++ b/docs/engineering/skills/README.md
@@ -0,0 +1,15 @@
+# Engineering Skills
+
+This directory is the canonical source for AI-facing engineering guidance.
+
+Tool-specific instruction files such as `AGENTS.md`, `CLAUDE.md`, and
+`.github/copilot-instructions.md` should point here instead of duplicating
+implementation-specific guidance. When a rule changes, update the skill here
+first, then keep adapters thin.
+
+Current skills:
+
+- `testing.md`: backend/frontend test lanes, fixture scope, live-model test
+  gates, and verification commands.
+- `uk-chat-runtime.md`: UK chat AI pathway, prompt ownership, exposed tools,
+  deterministic calculation boundaries, privacy, and neutrality rules.
diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md
new file mode 100644
index 0000000..8c0d8b0
--- /dev/null
+++ b/docs/engineering/skills/testing.md
@@ -0,0 +1,47 @@
+# Testing
+
+Use this skill whenever adding, moving, or reviewing tests.
+
+## Layout
+
+- Backend tests live under `backend/tests/`.
+- Frontend checks currently run through the frontend build.
+- Keep fixtures in `backend/tests/conftest.py` only when they are broadly useful
+  across backend tests. Prefer named fixtures over broad autouse fixtures when
+  only a subset of tests needs isolation.
+- Do not let tests require live Anthropic access by default. Live model tests
+  must be gated behind `RUN_LIVE_ANTHROPIC_TESTS=1` and `ANTHROPIC_API_KEY`.
+
+## Dependency Boundaries
+
+- Unit tests should mock network, database, and model-client seams unless they
+  are explicitly marked as live/integration tests.
+- Tests that depend on `policyengine_uk_compiled` should skip cleanly when that
+  package is not installed locally, while CI should install backend
+  dependencies before running the full backend suite.
+- Conversation-table tests should use the named isolated table fixture rather
+  than a shared developer database.
+
+## Commands
+
+Before handing off backend changes, run the focused backend tests that cover the
+changed code. For broader verification, use:
+
+```bash
+make test-backend
+```
+
+Before handing off frontend changes, run:
+
+```bash
+make test-frontend
+```
+
+For changes spanning both sides, run:
+
+```bash
+make test
+```
+
+If a command cannot run locally because dependencies or credentials are missing,
+state that explicitly in the handoff.
diff --git a/docs/engineering/skills/uk-chat-runtime.md b/docs/engineering/skills/uk-chat-runtime.md
new file mode 100644
index 0000000..ceeb9a9
--- /dev/null
+++ b/docs/engineering/skills/uk-chat-runtime.md
@@ -0,0 +1,67 @@
+# UK Chat Runtime
+
+Use this skill when changing the UK chat model pathway, system prompts, exposed
+tools, calculation behavior, or AI-facing runtime boundaries.
+
+## Source Boundaries
+
+- `backend/prompts.py` owns product runtime prompt text. Keep prompts modular and
+  declarative there.
+- `backend/routes/chatbot.py` owns application orchestration: request parsing,
+  system block assembly, model calls, SSE streaming, tool-loop handling,
+  usage/billing, title generation, and follow-up suggestions.
+- `backend/agent_tools.py` owns deterministic tool implementations and model
+  tool schemas.
+- `backend/scripts/build_reference.py` builds the API reference that is attached
+  to the chat system prompt.
+
+Do not spread prompt strings back into route handlers. If runtime prompt rules
+change, update `backend/prompts.py` and the prompt contract tests together.
+
+## Model Harness
+
+The current chat runtime is application-specific rather than a generic model
+harness. It calls the Anthropic SDK directly for streaming control. Pydantic AI
+imports/comments may exist, but they should not be treated as the active
+orchestration layer unless the code is deliberately refactored.
+
+Keep model/provider-specific code at the orchestration edge. Durable guidance
+for agents belongs in `docs/engineering/skills/`; product behavior prompts
+belong in `backend/prompts.py`.
+
+## Tool Boundary
+
+Only tools listed in `TOOL_DEFINITIONS` and dispatched by `execute_tool()` are
+exposed to the model. At present, the exposed tools are:
+
+- `run_python`: execute reproducible PolicyEngine UK Python code.
+- `generate_chart`: return frontend-renderable chart JSON markdown.
+
+Helper functions in `backend/agent_tools.py` are implementation details unless
+they are added to both the tool definitions and dispatcher.
+
+## Deterministic And Non-Deterministic Segments
+
+- Non-deterministic: user text interpretation, model planning, tool selection,
+  prose generation, follow-up suggestions, and title generation.
+- Deterministic: request validation, plan-mode tool omission, tool dispatch,
+  Python execution, chart JSON construction, result truncation/summarisation,
+  billing calculation, and database writes.
+
+Plan mode must remain structurally enforced by omitting tools from the model
+request, not only by prompting the model not to call tools.
+
+## Policy Analysis Rules
+
+- Be factually neutral. Do not call UK tax or benefit choices good, bad, fair,
+  unfair, regressive, progressive, generous, punitive, or similar.
+- Quantitative policy answers should be computed with `run_python`; do not
+  answer tax, benefit, reform, poverty, decile, or distributional questions from
+  memory.
+- Do not access, display, quote, or imply access to row-level survey microdata
+  or real households.
+- Use aggregate microdata interfaces only for aggregate outputs.
+- If a user asks for household examples, construct illustrative synthetic
+  households with the public `Simulation` API. Prefer
+  `Simulation.single_person()` for single-person examples, and label examples as
+  illustrative, synthetic, or hypothetical.