diff --git a/backend/routes/chatbot.py b/backend/routes/chatbot.py index 7bd96e2..2685f33 100644 --- a/backend/routes/chatbot.py +++ b/backend/routes/chatbot.py @@ -102,6 +102,17 @@ from pathlib import Path import anthropic as anthropic_sdk +# Soft cap on tool-use iterations within a single /chat/message stream. +# An "iteration" is one round-trip to Anthropic that may include tool calls. +# We lowered this from 60 to keep runaway agents from hanging the Vercel +# proxy (which times out the SSE connection and surfaces as "Failed to fetch" +# in the browser). When hit, we emit a user-facing fallback message and a +# `done` event with stop_reason="iteration_cap" instead of cutting off mid-stream. +# NOTE: this is per-request. The /chat/message "continue" flow re-enters this +# loop with a fresh budget, but the prior tool transcript is already in the +# conversation so the model resumes mid-thought rather than restarting. +MAX_ITERATIONS = 12 + DEFAULT_FAST_MODEL = os.environ.get("ANTHROPIC_FAST_MODEL", "claude-haiku-4-5") DEFAULT_COMPLEX_MODEL = os.environ.get("ANTHROPIC_COMPLEX_MODEL", "claude-sonnet-4-6") TITLE_MODEL = os.environ.get("ANTHROPIC_TITLE_MODEL", DEFAULT_FAST_MODEL) @@ -418,12 +429,17 @@ async def generate_stream(): try: conversation = deduplicated.copy() iteration = 0 - max_iterations = 60 + max_iterations = MAX_ITERATIONS total_input_tokens = 0 total_output_tokens = 0 total_cache_read_input_tokens = 0 total_cache_creation_input_tokens = 0 recent_tool_calls: List[str] = [] + # Track every tool call across the turn so the cap-hit fallback can + # tell the user what was tried. Counts only (no inputs) keeps PII out + # of the summary and keeps it well under the 300-char target. + tool_call_counts: Dict[str, int] = {} + last_tool_error: str | None = None client = _get_anthropic_client() model = _select_chat_model(conversation) @@ -529,6 +545,10 @@ async def generate_stream(): yield f"data: {json.dumps({'type': 'thinking_done'})}\n\n" if not tool_uses: + logger.info( + f"[CHAT] Session {session_id}: converged at {iteration} iterations" + f" stop_reason={last_stop_reason}" + ) # Record token usage for billing billing = None try: @@ -601,6 +621,15 @@ async def execute_tool_async(tu): if await request.is_disconnected(): return completed_tools[tu["id"]] = result + tool_call_counts[tu["name"]] = tool_call_counts.get(tu["name"], 0) + 1 + # Capture the most recent tool error so the cap-hit fallback + # can hint at what the agent was struggling with. + if isinstance(result, dict): + err = result.get("error") or result.get("stderr") + if err: + err_str = str(err).strip().splitlines()[-1] if str(err).strip() else "" + if err_str: + last_tool_error = err_str[:120] result_str = _serialise_tool_result(result) result_summary = result_str[:5000] + "..." if len(result_str) > 5000 else result_str yield f"data: {json.dumps({'type': 'tool_result', 'tool_name': tu['name'], 'tool_id': tu['id'], 'status': 'success', 'result_summary': result_summary})}\n\n" @@ -629,6 +658,32 @@ async def execute_tool_async(tu): conversation.append({"role": "user", "content": tool_results}) if iteration >= max_iterations: + logger.info( + f"[CHAT] Session {session_id}: iteration cap hit at {iteration} iterations" + f" — tool_counts={tool_call_counts}" + f"{f' last_error={last_tool_error!r}' if last_tool_error else ''}" + ) + # Build a short summary of what was tried. Kept under ~300 chars + # so it reads as a sentence, not a transcript. + if tool_call_counts: + parts = [f"`{name}` {count}×" for name, count in tool_call_counts.items()] + tried_clause = "ran " + ", ".join(parts) + else: + tried_clause = "didn't complete any tool calls" + error_clause = ( + f", last attempt errored with \"{last_tool_error}\"" + if last_tool_error else "" + ) + fallback_message = ( + "\n\nI'm spending more iterations than expected on this without converging. " + f"Here's what I tried: {tried_clause}{error_clause}. " + "Could you (a) rephrase the question, (b) enable Plan mode so I can ask " + "clarifying questions first, or (c) try a more specific scenario?" + ) + # Hard cap defensively in case tool names balloon the string. + if len(fallback_message) > 600: + fallback_message = fallback_message[:597] + "..." + billing = None try: from routes.billing import record_usage @@ -643,8 +698,9 @@ async def execute_tool_async(tu): ) except Exception as e: logger.warning(f"[CHAT] Failed to record usage: {e}") - yield f"data: {json.dumps({'type': 'chunk', 'content': '\\n\\n*[Reached maximum iterations]*'})}\n\n" - yield f"data: {json.dumps({'type': 'done', 'content': assistant_content, 'session_id': session_id, 'model': model, 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n" + yield f"data: {json.dumps({'type': 'chunk', 'content': fallback_message})}\n\n" + final_content = assistant_content + fallback_message + yield f"data: {json.dumps({'type': 'done', 'content': final_content, 'session_id': session_id, 'model': model, 'stop_reason': 'iteration_cap', 'usage': {'input_tokens': total_input_tokens, 'output_tokens': total_output_tokens, 'cache_creation_input_tokens': total_cache_creation_input_tokens, 'cache_read_input_tokens': total_cache_read_input_tokens}, 'cost_gbp': billing['cost_gbp'] if billing else None, 'balance': billing['balance'] if billing else None})}\n\n" except Exception as e: import traceback