PolicyEngine · SakshiKekre · May 27, 2026
diff --git a/backend/agent_tools.py b/backend/agent_tools.py
@@ -728,6 +728,9 @@ def execute_tool(
         "run_python": run_python,
         "compute": compute,
         "generate_chart": generate_chart,
+        "calculate_household": calculate_household,
+        "run_economy_simulation": run_economy_simulation,
+        "analyse_microdata": analyse_microdata,
     }
     if tool_name not in tools:
         return {"error": f"Unknown tool: {tool_name}"}
@@ -747,12 +750,156 @@ def execute_tool(
         return {"error": str(e)}
 
 
+# Reform-input schema shared (same dict object) by the three typed tools below,
+# so treat it as read-only. Top-level keys are the UK programme names accepted by
+# _build_compiled_policy(); each programme's fields mirror its *Params constructor
+# (e.g. IncomeTaxParams). `additionalProperties: True` keeps the schema permissive;
+# _build_compiled_policy enforces the real validation and lists the valid fields.
+REFORM_SCHEMA = {
+    "type": "object",
+    "description": (
+        "Parametric reform. Top-level keys are programmes; values are the "
+        "parameter changes for that programme. Valid programmes: income_tax, "
+        "national_insurance, universal_credit, child_benefit, state_pension, "
+        "pension_credit, benefit_cap, housing_benefit, tax_credits, "
+        "scottish_child_payment, stamp_duty, capital_gains_tax, wealth_tax. "
+        "Field names within each programme match the corresponding *Params "
+        "constructor (e.g. income_tax.personal_allowance, "
+        "national_insurance.main_rate). For structural reforms (new policies "
+        "or replacing existing ones), use run_python instead."
+    ),
+    "additionalProperties": True,
+}
+
+
 def get_tool_definitions(backend_id: str = "uk_compiled") -> List[Dict[str, Any]]:
     backend = get_backend(backend_id)
     return [
+        {
+            "name": "calculate_household",
+            "description": (
+                "Compute taxes, benefits, and net income for a SPECIFIC household "
+                "you describe (person/benunit/household records). Prefer this over "
+                "run_python for any household-level question with a defined "
+                "household composition. Returns full per-person, per-benunit, and "
+                "per-household results including baseline and reform columns."
+            ),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "person": {
+                        "type": "array",
+                        "items": {"type": "object"},
+                        "description": (
+                            "List of person records. Each must include person_id, "
+                            "benunit_id, household_id, age. Common optional fields: "
+                            "employment_income, self_employment_income, pension_income."
+                        ),
+                    },
+                    "benunit": {
+                        "type": "array",
+                        "items": {"type": "object"},
+                        "description": (
+                            "List of benefit-unit records, each with benunit_id "
+                            "and household_id."
+                        ),
+                    },
+                    "household": {
+                        "type": "array",
+                        "items": {"type": "object"},
+                        "description": (
+                            "List of household records, each with household_id "
+                            "and region (e.g. 'ENGLAND', 'SCOTLAND', 'WALES', "
+                            "'NORTHERN_IRELAND')."
+                        ),
+                    },
+                    "year": {"type": "integer", "default": 2025},
+                    "reform": REFORM_SCHEMA,
+                },
+                "required": ["person", "benunit", "household"],
+            },
+        },
+        {
+            "name": "run_economy_simulation",
+            "description": (
+                "Run a UK economy-wide microsimulation comparing baseline "
+                "(current law) to a parametric reform. Returns budgetary impact, "
+                "programme breakdown, decile impacts, winners/losers, caseloads, "
+                "HBAI incomes, and poverty metrics — all methodology-pinned to "
+                "PolicyEngine canonical definitions (BHC poverty, OECD-modified "
+                "equivalisation, survey-weighted). Prefer this over run_python "
+                "for any society-wide reform analysis. Does NOT support structural "
+                "reforms — for those, use run_python."
+            ),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "year": {"type": "integer", "default": 2025},
+                    "reform": REFORM_SCHEMA,
+                    "dataset": {
+                        "type": "string",
+                        "enum": ["frs", "efrs", "spi", "lcfs", "was"],
+                        "default": "frs",
+                        "description": (
+                            "Microdata source. frs = Family Resources Survey "
+                            "(default), efrs = Enhanced FRS (matches PE-API)."
+                        ),
+                    },
+                },
+                "required": [],
+            },
+        },
+        {
+            "name": "analyse_microdata",
+            "description": (
+                "Slice, filter, sample, or aggregate the cached microdata for a "
+                "given year + reform. Use this for 'show me N households like X', "
+                "'break the result down by Y', 'mean income for subset Z'. "
+                "Operates on cached microdata, so it's fast for follow-up "
+                "questions on the same reform."
+            ),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "entity": {
+                        "type": "string",
+                        "enum": ["persons", "benunits", "households"],
+                    },
+                    "operation": {
+                        "type": "string",
+                        "enum": ["sample", "mean", "sum", "count", "group_by", "describe"],
+                    },
+                    "year": {"type": "integer", "default": 2025},
+                    "reform": REFORM_SCHEMA,
+                    "filters": {
+                        "type": "object",
+                        "description": (
+                            "Column → predicate. Predicate can be a scalar (=), "
+                            "a list (in), or a dict with keys "
+                            "min/max/gt/lt/gte/lte/ne."
+                        ),
+                    },
+                    "columns": {"type": "array", "items": {"type": "string"}},
+                    "group_by": {"type": "array", "items": {"type": "string"}},
+                    "n": {
+                        "type": "integer",
+                        "default": 5,
+                        "description": "Sample size when operation=sample.",
+                    },
+                    "dataset": {"type": "string", "default": "frs"},
+                },
+                "required": ["entity", "operation"],
+            },
+        },
         {
             "name": "run_python",
-            "description": backend.tool_description(),
+            "description": backend.tool_description() + (
+                "\n\nFALLBACK: prefer the typed tools (calculate_household, "
+                "run_economy_simulation, analyse_microdata) when the question fits "
+                "their shape. Use run_python for questions those tools can't "
+                "express — structural reforms, novel aggregations, parameter "
+                "introspection, historical lookups, etc."
+            ),
             "input_schema": {
                 "type": "object",
                 "properties": {

diff --git a/backend/routes/chatbot.py b/backend/routes/chatbot.py
@@ -29,16 +29,19 @@
 # ---------------------------------------------------------------------------
 SYSTEM_PROMPT_TEMPLATE = """You are an expert policy analysis assistant for a microsimulation platform. You help users understand and analyse tax and benefit policy using reproducible Python code.
 
-CRITICAL - ALWAYS COMPUTE WITH PYTHON:
+CRITICAL - ALWAYS COMPUTE; NEVER ANSWER FROM MEMORY:
 - Never answer quantitative policy questions from memory.
-- You have one execution tool: `run_python`.
-- Use `run_python` for every tax, benefit, reform, schedule, poverty, decile, and distributional question.
-- Every number in your answer must come directly from the Python result you just computed.
-
-CRITICAL - START BY READING THE MODEL INSTRUCTIONS:
-- At the start of a new line of analysis, use Python to inspect `capabilities()`.
-- Use that to ground yourself in the available datasets, years, programmes, and caveats before you simulate.
-- If the user asks about something outside the modelled scope, say so clearly instead of guessing.
+- Every number in your answer must come directly from a tool call you just made.
+
+You have four execution tools, ordered from most specific to most general:
+- `calculate_household` — for any question about a specific household you can describe (person/benunit/household composition with incomes/ages/region).
+- `run_economy_simulation` — for society-wide reform analysis. Methodology is pinned (BHC poverty, OECD-modified equivalisation, FRS dataset by default). Parametric reforms only.
+- `analyse_microdata` — for slicing, filtering, sampling, or aggregating across the population for a given reform.
+- `run_python` — fallback for anything the typed tools can't express (structural reforms, novel aggregations, parameter history lookups, etc.).
+
+Prefer the typed tools first; reach for run_python only when no typed tool fits. For the typed tools, the JSON schema tells you what's allowed — call them directly. For run_python, inspect `capabilities()` first to ground yourself in available datasets, years, programmes, and caveats.
+
+If the user asks about something outside the modelled scope, say so clearly instead of guessing.
 
 {backend_prompt_context}
 

diff --git a/frontend/src/app/ChatPage.tsx b/frontend/src/app/ChatPage.tsx
@@ -719,7 +719,12 @@ export default function ChatPage() {
         >
           {t.status === "pending" && <Loader size={10} color={THEME.primary} />}
           {hasDetails && <IconChevronDown size={10} style={{ opacity: 0.4, transform: isExpanded ? "none" : "rotate(-90deg)", transition: "transform 0.15s" }} />}
-          <span style={{ color: THEME.text3 }}>{t.tool_name === "run_python" ? "python" : t.tool_name}</span>
+          <span style={{ color: THEME.text3 }}>{({
+            run_python: "python",
+            calculate_household: "household sim",
+            run_economy_simulation: "economy sim",
+            analyse_microdata: "microdata analysis",
+          } as Record<string, string>)[t.tool_name] ?? t.tool_name}</span>
           {t.status !== "pending" && <span style={{ color: THEME.muted }}>✓</span>}
         </div>
         {isExpanded && hasDetails && renderToolDetails(t)}