From 9fe09944797ce709c6e1044ff163ea82967513d7 Mon Sep 17 00:00:00 2001
From: Beandon13
Date: Tue, 5 May 2026 14:53:24 -0400
Subject: [PATCH] fix(anthropic): warn when stop_reason indicates max_tokens
 truncation

The Anthropic Messages API returns stop_reason='max_tokens' when the
output is cut off before generation has finished, but
AnthropicCompletion previously discarded the field after extracting
token usage. Silent truncation of the final synthesis response (after
tool use) is especially harmful because partial JSON or partial answers
are passed downstream as if they were complete.

Surface the signal as a logging.warning at all four completion sites
(sync and async, for both the plain completion and the tool-use
follow-up). The warning includes the agent role and the current
max_tokens value so the log line is actionable.

Closes #5148
---
 .../llms/providers/anthropic/completion.py |  52 +++++++++
 .../tests/llms/anthropic/test_anthropic.py | 102 ++++++++++++++++++
 2 files changed, 154 insertions(+)

diff --git a/lib/crewai/src/crewai/llms/providers/anthropic/completion.py b/lib/crewai/src/crewai/llms/providers/anthropic/completion.py
index b627a85394..5d48073653 100644
--- a/lib/crewai/src/crewai/llms/providers/anthropic/completion.py
+++ b/lib/crewai/src/crewai/llms/providers/anthropic/completion.py
@@ -124,6 +124,44 @@ def _contains_file_id_reference(messages: list[dict[str, Any]]) -> bool:
     return False
 
 
+def _warn_on_max_tokens_truncation(
+    response: Any,
+    max_tokens: int,
+    from_agent: Any | None = None,
+    *,
+    context: str = "",
+) -> None:
+    """Emit a warning if the Anthropic response was truncated by ``max_tokens``.
+
+    Anthropic returns a ``stop_reason`` field on every Message response
+    indicating why generation stopped. ``"max_tokens"`` means the output was
+    cut off before the model finished, which can corrupt downstream parsing
+    (especially for the final synthesis response after tool use).
+
+    The Anthropic SDK exposes this via ``response.stop_reason``. We surface
+    the signal as a ``logging.warning`` so users can detect truncation
+    without wiring up event subscribers.
+
+    Args:
+        response: The Anthropic Message-like response object.
+        max_tokens: The configured max_tokens value (for the actionable hint).
+        from_agent: Optional agent reference for log context.
+        context: Optional short label (e.g. ``"tool conversation"``) included
+            in the log message to disambiguate truncation sites.
+    """
+    if response is None:
+        return
+    if getattr(response, "stop_reason", None) != "max_tokens":
+        return
+    role = getattr(from_agent, "role", None)
+    agent_hint = f" [{role}]" if role else ""
+    location = f" ({context})" if context else ""
+    logging.warning(
+        f"Truncated response{agent_hint}{location}: stop_reason='max_tokens'. "
+        f"Consider increasing max_tokens (current: {max_tokens})."
+    )
+
+
 class AnthropicThinkingConfig(BaseModel):
     type: Literal["enabled", "disabled"]
     budget_tokens: int | None = None
@@ -844,6 +882,7 @@ def _handle_completion(
 
         usage = self._extract_anthropic_token_usage(response)
         self._track_token_usage_internal(usage)
+        _warn_on_max_tokens_truncation(response, self.max_tokens, from_agent)
 
         if _is_pydantic_model_class(response_model) and response.content:
             if use_native_structured_output:
@@ -1272,6 +1311,12 @@ def _handle_tool_use_conversation(
         # Track token usage for follow-up call
         follow_up_usage = self._extract_anthropic_token_usage(final_response)
         self._track_token_usage_internal(follow_up_usage)
+        _warn_on_max_tokens_truncation(
+            final_response,
+            self.max_tokens,
+            from_agent,
+            context="tool conversation",
+        )
 
         final_content = ""
         thinking_blocks: list[ThinkingBlock] = []
@@ -1377,6 +1422,7 @@ async def _ahandle_completion(
 
         usage = self._extract_anthropic_token_usage(response)
         self._track_token_usage_internal(usage)
+        _warn_on_max_tokens_truncation(response, self.max_tokens, from_agent)
 
         if _is_pydantic_model_class(response_model) and response.content:
             if use_native_structured_output:
@@ -1676,6 +1722,12 @@ async def _ahandle_tool_use_conversation(
 
         follow_up_usage = self._extract_anthropic_token_usage(final_response)
         self._track_token_usage_internal(follow_up_usage)
+        _warn_on_max_tokens_truncation(
+            final_response,
+            self.max_tokens,
+            from_agent,
+            context="tool conversation",
+        )
 
         final_content = ""
         if final_response.content:
diff --git a/lib/crewai/tests/llms/anthropic/test_anthropic.py b/lib/crewai/tests/llms/anthropic/test_anthropic.py
index 81a51c8d6f..0c8cc97864 100644
--- a/lib/crewai/tests/llms/anthropic/test_anthropic.py
+++ b/lib/crewai/tests/llms/anthropic/test_anthropic.py
@@ -1505,3 +1505,105 @@ def test_anthropic_missing_cache_fields_default_to_zero():
     usage = llm._extract_anthropic_token_usage(mock_response)
     assert usage["cached_prompt_tokens"] == 0
     assert usage["cache_creation_tokens"] == 0
+
+
+def test_warn_on_max_tokens_truncation_emits_warning(caplog):
+    """The helper warns when stop_reason == 'max_tokens' (issue #5148)."""
+    import logging as stdlib_logging
+
+    from crewai.llms.providers.anthropic.completion import (
+        _warn_on_max_tokens_truncation,
+    )
+
+    response = MagicMock()
+    response.stop_reason = "max_tokens"
+
+    with caplog.at_level(stdlib_logging.WARNING):
+        _warn_on_max_tokens_truncation(response, max_tokens=4096)
+
+    assert any(
+        rec.levelno == stdlib_logging.WARNING
+        and "max_tokens" in rec.getMessage()
+        and "Truncated response" in rec.getMessage()
+        for rec in caplog.records
+    )
+
+
+def test_warn_on_max_tokens_truncation_silent_for_normal_stop(caplog):
+    """The helper stays silent when generation finished normally (issue #5148)."""
+    import logging as stdlib_logging
+
+    from crewai.llms.providers.anthropic.completion import (
+        _warn_on_max_tokens_truncation,
+    )
+
+    response = MagicMock()
+    response.stop_reason = "end_turn"
+
+    with caplog.at_level(stdlib_logging.WARNING):
+        _warn_on_max_tokens_truncation(response, max_tokens=4096)
+
+    assert not any(
+        rec.levelno == stdlib_logging.WARNING
+        and "Truncated response" in rec.getMessage()
+        for rec in caplog.records
+    )
+
+
+def test_warn_on_max_tokens_truncation_includes_agent_role(caplog):
+    """The helper surfaces the agent role when provided (issue #5148)."""
+    import logging as stdlib_logging
+
+    from crewai.llms.providers.anthropic.completion import (
+        _warn_on_max_tokens_truncation,
+    )
+
+    response = MagicMock()
+    response.stop_reason = "max_tokens"
+
+    fake_agent = MagicMock()
+    fake_agent.role = "Senior Researcher"
+
+    with caplog.at_level(stdlib_logging.WARNING):
+        _warn_on_max_tokens_truncation(
+            response, max_tokens=2048, from_agent=fake_agent, context="tool conversation"
+        )
+
+    matching = [
+        rec.getMessage()
+        for rec in caplog.records
+        if rec.levelno == stdlib_logging.WARNING and "Truncated response" in rec.getMessage()
+    ]
+    assert matching, "expected a truncation warning"
+    msg = matching[0]
+    assert "Senior Researcher" in msg
+    assert "tool conversation" in msg
+    assert "2048" in msg
+
+
+def test_handle_completion_logs_truncation_warning(caplog):
+    """`_handle_completion` surfaces stop_reason='max_tokens' as a warning."""
+    import logging as stdlib_logging
+
+    llm = LLM(model="anthropic/claude-3-5-sonnet-20241022")
+
+    mock_response = MagicMock()
+    text_block = MagicMock()
+    text_block.text = "Truncated answer"
+    text_block.__class__ = type(text_block)
+    mock_response.content = [text_block]
+    mock_response.usage = MagicMock(input_tokens=10, output_tokens=5)
+    mock_response.usage.cache_read_input_tokens = 0
+    mock_response.usage.cache_creation_input_tokens = 0
+    mock_response.stop_reason = "max_tokens"
+
+    with patch.object(llm._client.messages, "create", return_value=mock_response):
+        with caplog.at_level(stdlib_logging.WARNING):
+            llm.call("Hello")
+
+    assert any(
+        rec.levelno == stdlib_logging.WARNING
+        and "max_tokens" in rec.getMessage()
+        and "Truncated response" in rec.getMessage()
+        for rec in caplog.records
+    )
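---

Note for reviewers (outside the patch proper): a minimal sketch of how the
warning surfaces end to end with stock logging, not a definitive recipe. It
assumes ANTHROPIC_API_KEY is set in the environment; the model id and
llm.call() usage mirror the tests above, while max_tokens=16 and the prompt
are arbitrary values chosen only to force stop_reason='max_tokens'. The
"WARNING:root:" prefix depends on your logging configuration.

    import logging

    from crewai import LLM

    # The helper logs via the root logger, so a basic config is enough
    # to see the warning on stderr.
    logging.basicConfig(level=logging.WARNING)

    # A deliberately tiny output budget makes Anthropic stop early.
    llm = LLM(model="anthropic/claude-3-5-sonnet-20241022", max_tokens=16)
    llm.call("Write three short paragraphs about token budgets.")
    # Expected log line (wording per this patch):
    #   WARNING:root:Truncated response: stop_reason='max_tokens'.
    #   Consider increasing max_tokens (current: 16).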