From abb210c1fa791e070d53dcf34f828a12c73cdafb Mon Sep 17 00:00:00 2001
From: Dennisadira <dennisadira@gmail.com>
Date: Mon, 22 Jun 2026 19:04:21 +0300
Subject: [PATCH] fix(openai): stop max_tokens streaming retry loop on
 reasoning models

When a thinking model spends its entire max_tokens budget on the reasoning
block, the C++ autoparser clears the raw Response and delivers reasoning-only
ChatDeltas (no content, no tool calls). ComputeChoices' empty-response retry
then fires and regenerates from scratch up to maxRetries times, each
re-consuming the whole budget, instead of terminating with finish_reason
"length" (issue #9716).

Add a reachedTokenBudget helper and suppress both the built-in and
caller-driven retries when the completion count has reached the configured
max_tokens ceiling. Report finish_reason "length" instead of "stop" in the
streaming and non-streaming chat paths when the budget was exhausted.

Add a deterministic regression test that counts backend invocations
(previously 6, now 1), plus boundary tests for the helper.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Dennisadira <dennisadira@gmail.com>
---
 core/http/endpoints/openai/chat.go           | 16 +++++
 core/http/endpoints/openai/constants.go      |  3 +
 core/http/endpoints/openai/inference.go      | 22 ++++++-
 core/http/endpoints/openai/inference_test.go | 67 ++++++++++++++++++++
 4 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index 5f243b3f2f64..f863631f6630 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					finishReason = FinishReasonToolCalls
 				} else if toolsCalled {
 					finishReason = FinishReasonFunctionCall
+				} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
+					// Generation stopped because it hit the max_tokens ceiling
+					// rather than a natural stop — report "length" (issue #9716).
+					finishReason = FinishReasonLength
 				}
 
 				// Final delta chunk: empty delta with finish_reason set. Per
@@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}
 
+				// If generation hit the max_tokens ceiling, report "length"
+				// instead of a natural "stop" (issue #9716). Mirrors the
+				// streaming path; tool/function finish reasons are untouched.
+				if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
+					for i := range result {
+						if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
+							lengthReason := FinishReasonLength
+							result[i].FinishReason = &lengthReason
+						}
+					}
+				}
+
 				// No MCP tools to execute (or no MCP tools configured), return response
 				usage := schema.OpenAIUsage{
 					PromptTokens:     tokenUsage.Prompt,
diff --git a/core/http/endpoints/openai/constants.go b/core/http/endpoints/openai/constants.go
index bc7dae10bccb..a241bd1a777e 100644
--- a/core/http/endpoints/openai/constants.go
+++ b/core/http/endpoints/openai/constants.go
@@ -5,4 +5,7 @@ const (
 	FinishReasonStop         = "stop"
 	FinishReasonToolCalls    = "tool_calls"
 	FinishReasonFunctionCall = "function_call"
+	// FinishReasonLength is reported when generation stopped because it
+	// reached the max_tokens budget rather than a natural stop (issue #9716).
+	FinishReasonLength = "length"
 )
diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go
index bc770dce709a..29756488d092 100644
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -13,6 +13,14 @@ import (
 	"github.com/mudler/xlog"
 )
 
+// reachedTokenBudget reports whether the completion token count has reached
+// or exceeded the configured max_tokens ceiling. A nil or <= 0 maxTokens means
+// "no limit" and always yields false. Callers use the result to skip retries
+// and to report finish_reason "length" instead of "stop" (issue #9716).
+func reachedTokenBudget(completion int, maxTokens *int) bool {
+	return maxTokens != nil && *maxTokens > 0 && completion >= *maxTokens
+}
+
 func ComputeChoices(
 	req *schema.OpenAIRequest,
 	predInput string,
@@ -113,11 +121,21 @@ func ComputeChoices(
 			}
 			prediction = p
 
+			// budgetExhausted is true when the model stopped because it reached
+			// the configured max_tokens ceiling. None of the retry paths below
+			// should fire in that case: regenerating would just hit the same
+			// ceiling again and multiply token consumption (issue #9716). A
+			// thinking model that spends its whole budget on the reasoning block
+			// produces an empty content / reasoning-only response, which would
+			// otherwise look like a failed generation worth retrying. This is a
+			// "length" finish, not an empty one.
+			budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
+
 			// Built-in: retry on truly empty response (no tokens at all).
 			// However, when the C++ autoparser is active, it clears the raw
 			// message and delivers content via ChatDeltas instead. Do NOT
 			// retry if ChatDeltas contain tool calls or content.
-			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
+			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
 				hasChatDeltaData := false
 				for _, d := range prediction.ChatDeltas {
 					if d.Content != "" || len(d.ToolCalls) > 0 {
@@ -159,7 +177,7 @@ func ComputeChoices(
 					}
 				}
 			}
-			if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
+			if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
 				// Caller has already reset its state inside shouldRetry
 				result = result[:0]
 				allChatDeltas = nil
diff --git a/core/http/endpoints/openai/inference_test.go b/core/http/endpoints/openai/inference_test.go
index a7ead8fd1d45..7debd15fcc52 100644
--- a/core/http/endpoints/openai/inference_test.go
+++ b/core/http/endpoints/openai/inference_test.go
@@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
 		})
 	})
 
+	Context("reachedTokenBudget", func() {
+		ptr := func(i int) *int { return &i }
+		It("is false when no limit is configured", func() {
+			Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
+		})
+		It("is false when generation stopped below the limit", func() {
+			Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
+		})
+		It("is true when generation reached or exceeded the limit", func() {
+			Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
+			Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
+		})
+	})
+
+	Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
+		// Reproduces, by calling ComputeChoices directly, the retry loop seen when
+		// streaming: a thinking model spends its entire max_tokens budget on the
+		// reasoning block, so the C++ autoparser clears the raw Response and
+		// delivers reasoning-only ChatDeltas (no content, no tool calls). The
+		// built-in empty-response retry then regenerated up to maxRetries times,
+		// each re-consuming the budget, instead of finishing with "length".
+		It("should NOT retry when the token budget was exhausted", func() {
+			maxTokens := 100
+			cfg.Maxtokens = &maxTokens
+
+			calls := 0
+			backend.ModelInferenceFunc = func(
+				ctx context.Context, s string, messages schema.Messages,
+				images, videos, audios []string,
+				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
+				o *config.ApplicationConfig,
+				tokenCallback func(string, backend.TokenUsage) bool,
+				tools, toolChoice string,
+				logprobs, topLogprobs *int,
+				logitBias map[string]float64,
+				metadata map[string]string,
+			) (func() (backend.LLMResponse, error), error) {
+				predFunc := func() (backend.LLMResponse, error) {
+					calls++
+					// Autoparser cleared Response; only reasoning was produced,
+					// and the completion count reached the max_tokens budget.
+					return backend.LLMResponse{
+						Response:   "",
+						ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
+						Usage:      backend.TokenUsage{Prompt: 5, Completion: maxTokens},
+					}, nil
+				}
+				return predFunc, nil
+			}
+
+			_, usage, _, err := ComputeChoices(
+				makeReq(), "test", cfg, nil, appCfg, nil,
+				func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s})
+				},
+				nil,
+			)
+			Expect(err).ToNot(HaveOccurred())
+			// The model hit its token ceiling; regenerating would just hit it
+			// again and multiply token consumption. Exactly one call expected.
+			Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
+			Expect(usage.Completion).To(Equal(maxTokens))
+		})
+	})
+
 	Context("with streaming token callback", func() {
 		It("should call tokenCallback for streaming responses", func() {
 			var streamedTokens []string