From daedfc2dac7636ced810a8ad70bac123a216afb8 Mon Sep 17 00:00:00 2001 From: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co> Date: Sat, 13 Jun 2026 15:30:12 -0400 Subject: [PATCH] Fix handoff summary history budget Co-authored-by: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co> Signed-off-by: npub1jh9wn95s0472h86ahapupaf7m6kx4v9sx2n0atj2hltcfer8k06s5n3pyf <95cae996907d7cab9f5dbf43c0f53edeac6ab0b032a6feae4abfd784e467b3f5@sprout-oss.stage.blox.sqprod.co> --- crates/buzz-agent/src/config.rs | 4 - crates/buzz-agent/src/handoff.rs | 98 +++++++++++++------ crates/buzz-agent/tests/regressions.rs | 126 +++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 33 deletions(-) diff --git a/crates/buzz-agent/src/config.rs b/crates/buzz-agent/src/config.rs index b8ed1a099..1d5a3df3b 100644 --- a/crates/buzz-agent/src/config.rs +++ b/crates/buzz-agent/src/config.rs @@ -18,12 +18,8 @@ pub const MAX_TOOL_CALLS_PER_TURN: usize = 64; pub const HANDOFF_MAX_OUTPUT_TOKENS: u32 = 8192; -pub const HANDOFF_TAIL_ITEMS: usize = 5; - pub const HANDOFF_ORIGINAL_TASK_MAX_BYTES: usize = 16 * 1024; -pub const HANDOFF_PROMPT_MAX_BYTES: usize = 32 * 1024; - pub const HANDOFF_MAX_TOOL_NAMES: usize = 20; const DEFAULT_SYSTEM_PROMPT: &str = diff --git a/crates/buzz-agent/src/handoff.rs b/crates/buzz-agent/src/handoff.rs index 619586ac7..65b733538 100644 --- a/crates/buzz-agent/src/handoff.rs +++ b/crates/buzz-agent/src/handoff.rs @@ -1,7 +1,6 @@ use crate::agent::RunCtx; use crate::config::{ HANDOFF_MAX_OUTPUT_TOKENS, HANDOFF_MAX_TOOL_NAMES, HANDOFF_ORIGINAL_TASK_MAX_BYTES, - HANDOFF_PROMPT_MAX_BYTES, HANDOFF_TAIL_ITEMS, }; use crate::types::HistoryItem; @@ -16,8 +15,6 @@ turn of an autonomous agent. Be concise but thorough. Cover: what the original t you accomplished, key decisions made, what remains, and one concrete next step. Output plain \ text only — no tool calls, no JSON. Stay under 8192 tokens."; -const HANDOFF_SNIPPET_BYTES: usize = 2048; - impl RunCtx<'_> { pub(crate) async fn maybe_handoff(&mut self) -> HandoffOutcome { if !self.should_handoff() { @@ -160,32 +157,46 @@ impl RunCtx<'_> { Produce a context handoff summary covering: (1) original task, \ (2) what was accomplished, (3) key decisions, (4) what remains, \ (5) one concrete next step. Be concise but thorough. Plain text.\n"; - let history_header = "\n# Recent History (most recent last)\n"; - - let start = self.history.len().saturating_sub(HANDOFF_TAIL_ITEMS); - let mut snippets: Vec = self.history[start..] - .iter() - .map(|item| { - let mut s = String::new(); - push_history_snippet(&mut s, item); - s - }) - .collect(); + let history_header = "\n# Session History (oldest first)\n"; + let prompt_budget = handoff_prompt_budget_bytes( + self.cfg.max_context_tokens, + HANDOFF_MAX_OUTPUT_TOKENS, + head.len() + history_header.len() + tail.len(), + ); - let fixed = head.len() + history_header.len() + tail.len(); - let mut snippets_bytes: usize = snippets.iter().map(String::len).sum(); + let mut snippets: Vec = Vec::new(); + let mut snippets_bytes = 0usize; let mut dropped = 0usize; - while fixed + snippets_bytes > HANDOFF_PROMPT_MAX_BYTES && !snippets.is_empty() { - let removed = snippets.remove(0); - snippets_bytes -= removed.len(); - dropped += 1; + for item in self.history.iter().rev() { + let mut snippet = String::new(); + push_history_snippet(&mut snippet, item); + let snippet_bytes = snippet.len(); + if snippets_bytes.saturating_add(snippet_bytes) > prompt_budget { + if snippets.is_empty() { + snippets.push(clamp_bytes(&snippet, prompt_budget)); + snippets_bytes = prompt_budget; + } + dropped += 1; + continue; + } + snippets_bytes += snippet_bytes; + snippets.push(snippet); } + snippets.reverse(); if dropped > 0 { - tracing::info!("handoff prompt cap, dropped {dropped} oldest snippets"); + tracing::info!( + "handoff prompt budget, dropped {dropped} oldest snippets; kept {} bytes", + snippets_bytes + ); } - let mut out = - String::with_capacity(fixed + snippets_bytes + if dropped > 0 { 32 } else { 0 }); + let mut out = String::with_capacity( + head.len() + + history_header.len() + + tail.len() + + snippets_bytes + + if dropped > 0 { 32 } else { 0 }, + ); out.push_str(&head); out.push_str(history_header); if dropped > 0 { @@ -211,13 +222,13 @@ fn push_history_snippet(out: &mut String, item: &HistoryItem) { match item { HistoryItem::User(s) => { out.push_str("[user] "); - out.push_str(&clamp_for_snippet(s)); + out.push_str(s); out.push('\n'); } HistoryItem::Assistant { text, tool_calls } => { out.push_str("[assistant] "); if !text.is_empty() { - out.push_str(&clamp_for_snippet(text)); + out.push_str(text); } for c in tool_calls { out.push_str(&format!(" tool:{}", c.name)); @@ -226,14 +237,30 @@ fn push_history_snippet(out: &mut String, item: &HistoryItem) { } HistoryItem::ToolResult(r) => { out.push_str(if r.is_error { "[tool_err] " } else { "[tool] " }); - out.push_str(&clamp_for_snippet(&r.text())); + out.push_str(&r.text()); out.push('\n'); } } } -fn clamp_for_snippet(s: &str) -> String { - clamp_bytes(s, HANDOFF_SNIPPET_BYTES) +/// Byte budget for session-history text inside the handoff prompt. The +/// summarizer uses the same provider/model config as normal completion, so +/// derive the input budget from the model context window instead of applying a +/// separate fixed prompt cap. We keep the same 1 byte/token upper-bound +/// estimate used by the handoff gate, which is conservative: it may drop old +/// history early for unusually large sessions, but it should not build a prompt +/// that exceeds the configured context window. +fn handoff_prompt_budget_bytes( + max_context_tokens: u64, + max_output_tokens: u32, + fixed_prompt_bytes: usize, +) -> usize { + max_context_tokens + .saturating_sub(u64::from(max_output_tokens)) + .saturating_mul(CONSERVATIVE_BYTES_PER_TOKEN) + .saturating_sub(u64::try_from(fixed_prompt_bytes).unwrap_or(u64::MAX)) + .try_into() + .unwrap_or(usize::MAX) } pub(crate) fn clamp_bytes(s: &str, max_bytes: usize) -> String { @@ -299,7 +326,20 @@ fn byte_fallback_threshold( #[cfg(test)] mod tests { - use super::{byte_fallback_threshold, estimate_tokens_from_bytes, token_threshold}; + use super::{ + byte_fallback_threshold, estimate_tokens_from_bytes, handoff_prompt_budget_bytes, + token_threshold, + }; + + #[test] + fn handoff_prompt_budget_reserves_summary_output_and_fixed_prompt() { + assert_eq!(handoff_prompt_budget_bytes(25_000, 8_192, 1_000), 15_808); + } + + #[test] + fn handoff_prompt_budget_saturates_when_fixed_prompt_exceeds_window() { + assert_eq!(handoff_prompt_budget_bytes(1_000, 2_000, 10_000), 0); + } #[test] fn token_threshold_uses_fraction_when_output_is_small() { diff --git a/crates/buzz-agent/tests/regressions.rs b/crates/buzz-agent/tests/regressions.rs index 3fdaa9c59..414eccef5 100644 --- a/crates/buzz-agent/tests/regressions.rs +++ b/crates/buzz-agent/tests/regressions.rs @@ -1132,6 +1132,132 @@ async fn hook_post_compact_injects_after_handoff() { h.shutdown().await; } +/// The handoff summary prompt should include all session history when that +/// history fits the summarizer context budget. This protects against regressing +/// to the old fixed tail of five tiny snippets. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn handoff_summary_prompt_includes_full_history_within_context_budget() { + let llm = spawn_capturing_llm(vec![ + openai_text_with_usage("ack-0", 9500), + openai_text("handoff summary text"), + openai_text_with_usage("done", 10), + ]) + .await; + let mut h = Harness::spawn_with_env( + &llm.url, + &[ + ("BUZZ_AGENT_MAX_CONTEXT_TOKENS", "10000"), + ("BUZZ_AGENT_MAX_OUTPUT_TOKENS", "1000"), + ("BUZZ_AGENT_MAX_HANDOFFS", "3"), + ( + "BUZZ_AGENT_MAX_HISTORY_BYTES", + &(16 * 1024 * 1024).to_string(), + ), + ], + ) + .await; + let sid = init_session(&mut h, json!([])).await; + + let p0 = h + .send( + "session/prompt", + json!({"sessionId": sid, "prompt": [{"type":"text","text":"early-history-marker"}]}), + ) + .await; + let _ = h.recv_until(|v| v["id"] == json!(p0)).await; + + let p1 = h + .send( + "session/prompt", + json!({"sessionId": sid, "prompt": [{"type":"text","text":"late-history-marker"}]}), + ) + .await; + let _ = h.recv_until(|v| v["id"] == json!(p1)).await; + + let captured = llm.captured.lock().await; + assert_eq!(captured.len(), 3, "expected prompt, handoff, prompt"); + let handoff_messages = captured[1]["messages"].as_array().unwrap(); + let handoff_prompt = handoff_messages[1]["content"].as_str().unwrap(); + assert!( + handoff_prompt.contains("# Session History (oldest first)"), + "handoff prompt should describe full session history: {handoff_prompt}" + ); + assert!( + handoff_prompt.contains("early-history-marker"), + "oldest prompt was omitted despite fitting budget: {handoff_prompt}" + ); + assert!( + handoff_prompt.contains("ack-0"), + "assistant response was omitted despite fitting budget: {handoff_prompt}" + ); + assert!( + handoff_prompt.contains("late-history-marker"), + "latest prompt was omitted despite fitting budget: {handoff_prompt}" + ); + assert!( + !handoff_prompt.contains("older items omitted"), + "handoff should not report truncation when full history fits: {handoff_prompt}" + ); + h.shutdown().await; +} + +/// If one item is larger than the derived summarizer budget, keep a truncated +/// form of the most recent item instead of sending an empty history block. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn handoff_summary_prompt_keeps_latest_item_when_one_item_exceeds_budget() { + let llm = spawn_capturing_llm(vec![ + openai_text_with_usage("ack-0", 9500), + openai_text("handoff summary text"), + openai_text_with_usage("done", 10), + ]) + .await; + let mut h = Harness::spawn_with_env( + &llm.url, + &[ + ("BUZZ_AGENT_MAX_CONTEXT_TOKENS", "10000"), + ("BUZZ_AGENT_MAX_OUTPUT_TOKENS", "1000"), + ("BUZZ_AGENT_MAX_HANDOFFS", "3"), + ( + "BUZZ_AGENT_MAX_HISTORY_BYTES", + &(16 * 1024 * 1024).to_string(), + ), + ], + ) + .await; + let sid = init_session(&mut h, json!([])).await; + + let huge = format!("oversize-latest-marker {}", "x".repeat(12000)); + let p0 = h + .send( + "session/prompt", + json!({"sessionId": sid, "prompt": [{"type":"text","text":"early-history-marker"}]}), + ) + .await; + let _ = h.recv_until(|v| v["id"] == json!(p0)).await; + + let p1 = h + .send( + "session/prompt", + json!({"sessionId": sid, "prompt": [{"type":"text","text": huge}]}), + ) + .await; + let _ = h.recv_until(|v| v["id"] == json!(p1)).await; + + let captured = llm.captured.lock().await; + assert_eq!(captured.len(), 3, "expected prompt, handoff, prompt"); + let handoff_messages = captured[1]["messages"].as_array().unwrap(); + let handoff_prompt = handoff_messages[1]["content"].as_str().unwrap(); + assert!( + handoff_prompt.contains("oversize-latest-marker"), + "latest oversized item should be kept in truncated form: {handoff_prompt}" + ); + assert!( + handoff_prompt.contains("older items omitted"), + "handoff should report truncation when history exceeds budget: {handoff_prompt}" + ); + h.shutdown().await; +} + /// Regression for the original bug: context fills, the provider 400s on the /// next request, and the handoff never fires because the old gate measured /// BYTES (16 MiB threshold) while the limit is in TOKENS. The fix gates on