From abb210c1fa791e070d53dcf34f828a12c73cdafb Mon Sep 17 00:00:00 2001 From: Dennisadira Date: Mon, 22 Jun 2026 19:04:21 +0300 Subject: [PATCH] fix(openai): stop max_tokens streaming retry loop on reasoning models When a thinking model spends its entire max_tokens budget on the reasoning block, the C++ autoparser clears the raw Response and delivers reasoning-only ChatDeltas (no content, no tool calls). ComputeChoices' empty-response retry then fires and regenerates from scratch up to maxRetries times, each re-consuming the whole budget, instead of terminating with finish_reason "length" (issue #9716). Add a reachedTokenBudget helper and suppress both the built-in and caller-driven retries when the completion count has reached the configured max_tokens ceiling. Report finish_reason "length" instead of "stop" in the streaming and non-streaming chat paths when the budget was exhausted. Adds a deterministic regression test that counts backend invocations (previously 6, now 1) plus boundary tests for the helper. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Dennisadira --- core/http/endpoints/openai/chat.go | 16 +++++ core/http/endpoints/openai/constants.go | 3 + core/http/endpoints/openai/inference.go | 22 ++++++- core/http/endpoints/openai/inference_test.go | 67 ++++++++++++++++++++ 4 files changed, 106 insertions(+), 2 deletions(-) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 5f243b3f2f64..f863631f6630 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator finishReason = FinishReasonToolCalls } else if toolsCalled { finishReason = FinishReasonFunctionCall + } else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) { + // Generation stopped because it hit the max_tokens ceiling + // rather than a natural stop — report "length" (issue #9716). + finishReason = FinishReasonLength } // Final delta chunk: empty delta with finish_reason set. Per @@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator } } + // If generation hit the max_tokens ceiling, report "length" + // instead of a natural "stop" (issue #9716). Mirrors the + // streaming path; tool/function finish reasons are untouched. + if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) { + for i := range result { + if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop { + lengthReason := FinishReasonLength + result[i].FinishReason = &lengthReason + } + } + } + // No MCP tools to execute (or no MCP tools configured), return response usage := schema.OpenAIUsage{ PromptTokens: tokenUsage.Prompt, diff --git a/core/http/endpoints/openai/constants.go b/core/http/endpoints/openai/constants.go index bc7dae10bccb..a241bd1a777e 100644 --- a/core/http/endpoints/openai/constants.go +++ b/core/http/endpoints/openai/constants.go @@ -5,4 +5,7 @@ const ( FinishReasonStop = "stop" FinishReasonToolCalls = "tool_calls" FinishReasonFunctionCall = "function_call" + // FinishReasonLength is reported when generation stopped because it + // reached the max_tokens budget rather than a natural stop (issue #9716). + FinishReasonLength = "length" ) diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index bc770dce709a..29756488d092 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -13,6 +13,14 @@ import ( "github.com/mudler/xlog" ) +// reachedTokenBudget reports whether generation stopped because it reached the +// configured max_tokens ceiling. A maxTokens of nil or <= 0 means "no limit". +// Used to suppress regeneration retries (which would just hit the same ceiling +// again) and to report finish_reason "length" instead of "stop" (issue #9716). +func reachedTokenBudget(completion int, maxTokens *int) bool { + return maxTokens != nil && *maxTokens > 0 && completion >= *maxTokens +} + func ComputeChoices( req *schema.OpenAIRequest, predInput string, @@ -113,11 +121,21 @@ func ComputeChoices( } prediction = p + // budgetExhausted is true when the model stopped because it reached + // the configured max_tokens ceiling. None of the retry paths below + // should fire in that case: regenerating would just hit the same + // ceiling again and multiply token consumption (issue #9716). A + // thinking model that spends its whole budget on the reasoning block + // produces an empty content / reasoning-only response, which would + // otherwise look like a failed generation worth retrying. This is a + // "length" finish, not an empty one. + budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens) + // Built-in: retry on truly empty response (no tokens at all). // However, when the C++ autoparser is active, it clears the raw // message and delivers content via ChatDeltas instead. Do NOT // retry if ChatDeltas contain tool calls or content. - if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries { + if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted { hasChatDeltaData := false for _, d := range prediction.ChatDeltas { if d.Content != "" || len(d.ToolCalls) > 0 { @@ -159,7 +177,7 @@ func ComputeChoices( } } } - if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries { + if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries { // Caller has already reset its state inside shouldRetry result = result[:0] allChatDeltas = nil diff --git a/core/http/endpoints/openai/inference_test.go b/core/http/endpoints/openai/inference_test.go index a7ead8fd1d45..7debd15fcc52 100644 --- a/core/http/endpoints/openai/inference_test.go +++ b/core/http/endpoints/openai/inference_test.go @@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() { }) }) + Context("reachedTokenBudget", func() { + ptr := func(i int) *int { return &i } + It("is false when no limit is configured", func() { + Expect(reachedTokenBudget(1000, nil)).To(BeFalse()) + Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse()) + Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse()) + }) + It("is false when generation stopped below the limit", func() { + Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse()) + }) + It("is true when generation reached or exceeded the limit", func() { + Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue()) + Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue()) + }) + }) + + Context("max_tokens budget exhausted on reasoning (issue #9716)", func() { + // Reproduces the streaming retry loop: when a thinking model spends its + // entire max_tokens budget on the reasoning block, the C++ autoparser + // clears the raw Response and delivers reasoning-only ChatDeltas (no + // content, no tool calls). The built-in empty-response retry then fires + // and regenerates from scratch up to maxRetries times, each re-consuming + // the whole budget — instead of terminating with finish_reason "length". + It("should NOT retry when the token budget was exhausted", func() { + maxTokens := 100 + cfg.Maxtokens = &maxTokens + + calls := 0 + backend.ModelInferenceFunc = func( + ctx context.Context, s string, messages schema.Messages, + images, videos, audios []string, + loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, + o *config.ApplicationConfig, + tokenCallback func(string, backend.TokenUsage) bool, + tools, toolChoice string, + logprobs, topLogprobs *int, + logitBias map[string]float64, + metadata map[string]string, + ) (func() (backend.LLMResponse, error), error) { + predFunc := func() (backend.LLMResponse, error) { + calls++ + // Autoparser cleared Response; only reasoning was produced, + // and the completion count reached the max_tokens budget. + return backend.LLMResponse{ + Response: "", + ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}}, + Usage: backend.TokenUsage{Prompt: 5, Completion: maxTokens}, + }, nil + } + return predFunc, nil + } + + _, usage, _, err := ComputeChoices( + makeReq(), "test", cfg, nil, appCfg, nil, + func(s string, c *[]schema.Choice) { + *c = append(*c, schema.Choice{Text: s}) + }, + nil, + ) + Expect(err).ToNot(HaveOccurred()) + // The model hit its token ceiling; regenerating would just hit it + // again and multiply token consumption. Exactly one call expected. + Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried") + Expect(usage.Completion).To(Equal(maxTokens)) + }) + }) + Context("with streaming token callback", func() { It("should call tokenCallback for streaming responses", func() { var streamedTokens []string