From daedfc2dac7636ced810a8ad70bac123a216afb8 Mon Sep 17 00:00:00 2001
From: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf
 <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co>
Date: Sat, 13 Jun 2026 15:30:12 -0400
Subject: [PATCH] Fix handoff summary history budget

Co-authored-by: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co>
Signed-off-by: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co>
---
 crates/buzz-agent/src/config.rs        |   4 -
 crates/buzz-agent/src/handoff.rs       |  98 +++++++++++++------
 crates/buzz-agent/tests/regressions.rs | 126 +++++++++++++++++++++++++
 3 files changed, 195 insertions(+), 33 deletions(-)
diff --git a/crates/buzz-agent/src/config.rs b/crates/buzz-agent/src/config.rs
index b8ed1a099..1d5a3df3b 100644
--- a/crates/buzz-agent/src/config.rs
+++ b/crates/buzz-agent/src/config.rs
@@ -18,12 +18,8 @@ pub const MAX_TOOL_CALLS_PER_TURN: usize = 64;
 
 pub const HANDOFF_MAX_OUTPUT_TOKENS: u32 = 8192;
 
-pub const HANDOFF_TAIL_ITEMS: usize = 5;
-
 pub const HANDOFF_ORIGINAL_TASK_MAX_BYTES: usize = 16 * 1024;
 
-pub const HANDOFF_PROMPT_MAX_BYTES: usize = 32 * 1024;
-
 pub const HANDOFF_MAX_TOOL_NAMES: usize = 20;
 
 const DEFAULT_SYSTEM_PROMPT: &str =
diff --git a/crates/buzz-agent/src/handoff.rs b/crates/buzz-agent/src/handoff.rs
index 619586ac7..65b733538 100644
--- a/crates/buzz-agent/src/handoff.rs
+++ b/crates/buzz-agent/src/handoff.rs
@@ -1,7 +1,6 @@
 use crate::agent::RunCtx;
 use crate::config::{
     HANDOFF_MAX_OUTPUT_TOKENS, HANDOFF_MAX_TOOL_NAMES, HANDOFF_ORIGINAL_TASK_MAX_BYTES,
-    HANDOFF_PROMPT_MAX_BYTES, HANDOFF_TAIL_ITEMS,
 };
 use crate::types::HistoryItem;
 
@@ -16,8 +15,6 @@ turn of an autonomous agent. Be concise but thorough. Cover: what the original t
 you accomplished, key decisions made, what remains, and one concrete next step. Output plain \
 text only — no tool calls, no JSON. Stay under 8192 tokens.";
 
-const HANDOFF_SNIPPET_BYTES: usize = 2048;
-
 impl RunCtx<'_> {
     pub(crate) async fn maybe_handoff(&mut self) -> HandoffOutcome {
         if !self.should_handoff() {
@@ -160,32 +157,46 @@ impl RunCtx<'_> {
              Produce a context handoff summary covering: (1) original task, \
              (2) what was accomplished, (3) key decisions, (4) what remains, \
              (5) one concrete next step. Be concise but thorough. Plain text.\n";
-        let history_header = "\n# Recent History (most recent last)\n";
-
-        let start = self.history.len().saturating_sub(HANDOFF_TAIL_ITEMS);
-        let mut snippets: Vec<String> = self.history[start..]
-            .iter()
-            .map(|item| {
-                let mut s = String::new();
-                push_history_snippet(&mut s, item);
-                s
-            })
-            .collect();
+        let history_header = "\n# Session History (oldest first)\n";
+        let prompt_budget = handoff_prompt_budget_bytes(
+            self.cfg.max_context_tokens,
+            HANDOFF_MAX_OUTPUT_TOKENS,
+            head.len() + history_header.len() + tail.len(),
+        );
 
-        let fixed = head.len() + history_header.len() + tail.len();
-        let mut snippets_bytes: usize = snippets.iter().map(String::len).sum();
+        let mut snippets: Vec<String> = Vec::new();
+        let mut snippets_bytes = 0usize;
         let mut dropped = 0usize;
-        while fixed + snippets_bytes > HANDOFF_PROMPT_MAX_BYTES && !snippets.is_empty() {
-            let removed = snippets.remove(0);
-            snippets_bytes -= removed.len();
-            dropped += 1;
+        for (age, item) in self.history.iter().rev().enumerate() {
+            let mut snippet = String::new();
+            push_history_snippet(&mut snippet, item);
+            if snippets_bytes.saturating_add(snippet.len()) > prompt_budget {
+                if snippets.is_empty() {
+                    let clamped = clamp_bytes(&snippet, prompt_budget); // newest item, cut to fit
+                    snippets_bytes = clamped.len(); // count what was actually kept
+                    snippets.push(clamped);
+                }
+                dropped = self.history.len() - age; // this item and everything older
+                break; // skipping ahead would leave gaps the omission marker cannot express
+            }
+            snippets_bytes += snippet.len();
+            snippets.push(snippet);
         }
+        snippets.reverse();
         if dropped > 0 {
-            tracing::info!("handoff prompt cap, dropped {dropped} oldest snippets");
+            tracing::info!(
+                "handoff prompt budget, dropped {dropped} oldest snippets; kept {} bytes",
+                snippets_bytes
+            );
         }
 
-        let mut out =
-            String::with_capacity(fixed + snippets_bytes + if dropped > 0 { 32 } else { 0 });
+        let mut out = String::with_capacity(
+            head.len()
+                + history_header.len()
+                + tail.len()
+                + snippets_bytes
+                + if dropped > 0 { 32 } else { 0 },
+        );
         out.push_str(&head);
         out.push_str(history_header);
         if dropped > 0 {
@@ -211,13 +222,13 @@ fn push_history_snippet(out: &mut String, item: &HistoryItem) {
     match item {
         HistoryItem::User(s) => {
             out.push_str("[user] ");
-            out.push_str(&clamp_for_snippet(s));
+            out.push_str(s);
             out.push('\n');
         }
         HistoryItem::Assistant { text, tool_calls } => {
             out.push_str("[assistant] ");
             if !text.is_empty() {
-                out.push_str(&clamp_for_snippet(text));
+                out.push_str(text);
             }
             for c in tool_calls {
                 out.push_str(&format!(" tool:{}", c.name));
@@ -226,14 +237,30 @@ fn push_history_snippet(out: &mut String, item: &HistoryItem) {
         }
         HistoryItem::ToolResult(r) => {
             out.push_str(if r.is_error { "[tool_err] " } else { "[tool] " });
-            out.push_str(&clamp_for_snippet(&r.text()));
+            out.push_str(&r.text());
             out.push('\n');
         }
     }
 }
 
-fn clamp_for_snippet(s: &str) -> String {
-    clamp_bytes(s, HANDOFF_SNIPPET_BYTES)
+/// Returns how many bytes of session-history text the handoff prompt may carry.
+///
+/// The summarizer shares the provider/model config of normal completion, so the
+/// input budget is derived from the model context window rather than a separate
+/// fixed cap: the window minus the reserved `max_output_tokens`, converted at
+/// `CONSERVATIVE_BYTES_PER_TOKEN` (the handoff gate's 1 byte/token upper bound),
+/// minus the `fixed_prompt_bytes` already spent around the history. Conservative:
+/// huge sessions may lose old history early, but the prompt should stay within
+/// the window. Every step saturates, so an over-reserved window yields 0.
+fn handoff_prompt_budget_bytes(
+    max_context_tokens: u64,
+    max_output_tokens: u32,
+    fixed_prompt_bytes: usize,
+) -> usize {
+    let input_tokens = max_context_tokens.saturating_sub(u64::from(max_output_tokens));
+    let input_bytes = input_tokens.saturating_mul(CONSERVATIVE_BYTES_PER_TOKEN);
+    let fixed_bytes = u64::try_from(fixed_prompt_bytes).unwrap_or(u64::MAX);
+    usize::try_from(input_bytes.saturating_sub(fixed_bytes)).unwrap_or(usize::MAX)
 }
 
 pub(crate) fn clamp_bytes(s: &str, max_bytes: usize) -> String {
@@ -299,7 +326,20 @@ fn byte_fallback_threshold(
 
 #[cfg(test)]
 mod tests {
-    use super::{byte_fallback_threshold, estimate_tokens_from_bytes, token_threshold};
+    use super::{
+        byte_fallback_threshold, estimate_tokens_from_bytes, handoff_prompt_budget_bytes,
+        token_threshold,
+    };
+
+    #[test]
+    fn handoff_prompt_budget_reserves_summary_output_and_fixed_prompt() {
+        assert_eq!(handoff_prompt_budget_bytes(25_000, 8_192, 1_000), 15_808);
+    }
+
+    #[test]
+    fn handoff_prompt_budget_saturates_when_fixed_prompt_exceeds_window() {
+        assert_eq!(handoff_prompt_budget_bytes(1_000, 2_000, 10_000), 0);
+    }
 
     #[test]
     fn token_threshold_uses_fraction_when_output_is_small() {
diff --git a/crates/buzz-agent/tests/regressions.rs b/crates/buzz-agent/tests/regressions.rs
index 3fdaa9c59..414eccef5 100644
--- a/crates/buzz-agent/tests/regressions.rs
+++ b/crates/buzz-agent/tests/regressions.rs
@@ -1132,6 +1132,132 @@ async fn hook_post_compact_injects_after_handoff() {
     h.shutdown().await;
 }
 
+/// The handoff summary prompt should include all session history when that
+/// history fits the summarizer context budget. This protects against regressing
+/// to the old fixed tail of five tiny snippets.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn handoff_summary_prompt_includes_full_history_within_context_budget() {
+    let llm = spawn_capturing_llm(vec![
+        openai_text_with_usage("ack-0", 9500), // turn 1; usage presumably trips the handoff gate
+        openai_text("handoff summary text"), // reply to the handoff summary request
+        openai_text_with_usage("done", 10), // turn 2, answered after the handoff
+    ])
+    .await;
+    let mut h = Harness::spawn_with_env(
+        &llm.url,
+        &[
+            ("BUZZ_AGENT_MAX_CONTEXT_TOKENS", "10000"),
+            ("BUZZ_AGENT_MAX_OUTPUT_TOKENS", "1000"),
+            ("BUZZ_AGENT_MAX_HANDOFFS", "3"),
+            (
+                "BUZZ_AGENT_MAX_HISTORY_BYTES",
+                &(16 * 1024 * 1024).to_string(),
+            ),
+        ],
+    )
+    .await;
+    let sid = init_session(&mut h, json!([])).await;
+
+    let p0 = h
+        .send(
+            "session/prompt",
+            json!({"sessionId": sid, "prompt": [{"type":"text","text":"early-history-marker"}]}),
+        )
+        .await;
+    let _ = h.recv_until(|v| v["id"] == json!(p0)).await;
+
+    let p1 = h
+        .send(
+            "session/prompt",
+            json!({"sessionId": sid, "prompt": [{"type":"text","text":"late-history-marker"}]}),
+        )
+        .await;
+    let _ = h.recv_until(|v| v["id"] == json!(p1)).await;
+
+    let captured = llm.captured.lock().await;
+    assert_eq!(captured.len(), 3, "expected prompt, handoff, prompt");
+    let handoff_messages = captured[1]["messages"].as_array().unwrap(); // 2nd request = handoff
+    let handoff_prompt = handoff_messages[1]["content"].as_str().unwrap();
+    assert!(
+        handoff_prompt.contains("# Session History (oldest first)"),
+        "handoff prompt should describe full session history: {handoff_prompt}"
+    );
+    assert!(
+        handoff_prompt.contains("early-history-marker"),
+        "oldest prompt was omitted despite fitting budget: {handoff_prompt}"
+    );
+    assert!(
+        handoff_prompt.contains("ack-0"),
+        "assistant response was omitted despite fitting budget: {handoff_prompt}"
+    );
+    assert!(
+        handoff_prompt.contains("late-history-marker"),
+        "latest prompt was omitted despite fitting budget: {handoff_prompt}"
+    );
+    assert!(
+        !handoff_prompt.contains("older items omitted"),
+        "handoff should not report truncation when full history fits: {handoff_prompt}"
+    );
+    h.shutdown().await;
+}
+
+/// If one item is larger than the derived summarizer budget, keep a truncated
+/// form of the most recent item instead of sending an empty history block.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn handoff_summary_prompt_keeps_latest_item_when_one_item_exceeds_budget() {
+    let llm = spawn_capturing_llm(vec![
+        openai_text_with_usage("ack-0", 9500), // turn 1; usage presumably trips the handoff gate
+        openai_text("handoff summary text"), // reply to the handoff summary request
+        openai_text_with_usage("done", 10), // turn 2, answered after the handoff
+    ])
+    .await;
+    let mut h = Harness::spawn_with_env(
+        &llm.url,
+        &[
+            ("BUZZ_AGENT_MAX_CONTEXT_TOKENS", "10000"),
+            ("BUZZ_AGENT_MAX_OUTPUT_TOKENS", "1000"),
+            ("BUZZ_AGENT_MAX_HANDOFFS", "3"),
+            (
+                "BUZZ_AGENT_MAX_HISTORY_BYTES",
+                &(16 * 1024 * 1024).to_string(),
+            ),
+        ],
+    )
+    .await;
+    let sid = init_session(&mut h, json!([])).await;
+
+    let huge = format!("oversize-latest-marker {}", "x".repeat(12000)); // larger than the window
+    let p0 = h
+        .send(
+            "session/prompt",
+            json!({"sessionId": sid, "prompt": [{"type":"text","text":"early-history-marker"}]}),
+        )
+        .await;
+    let _ = h.recv_until(|v| v["id"] == json!(p0)).await;
+
+    let p1 = h
+        .send(
+            "session/prompt",
+            json!({"sessionId": sid, "prompt": [{"type":"text","text": huge}]}),
+        )
+        .await;
+    let _ = h.recv_until(|v| v["id"] == json!(p1)).await;
+
+    let captured = llm.captured.lock().await;
+    assert_eq!(captured.len(), 3, "expected prompt, handoff, prompt");
+    let handoff_messages = captured[1]["messages"].as_array().unwrap(); // 2nd request = handoff
+    let handoff_prompt = handoff_messages[1]["content"].as_str().unwrap();
+    assert!(
+        handoff_prompt.contains("oversize-latest-marker"),
+        "latest oversized item should be kept in truncated form: {handoff_prompt}"
+    );
+    assert!(
+        handoff_prompt.contains("older items omitted"),
+        "handoff should report truncation when history exceeds budget: {handoff_prompt}"
+    );
+    h.shutdown().await;
+}
+
 /// Regression for the original bug: context fills, the provider 400s on the
 /// next request, and the handoff never fires because the old gate measured
 /// BYTES (16 MiB threshold) while the limit is in TOKENS. The fix gates on