Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions core/http/endpoints/openai/chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
finishReason = FinishReasonToolCalls
} else if toolsCalled {
finishReason = FinishReasonFunctionCall
} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
// Generation stopped because it hit the max_tokens ceiling
// rather than a natural stop — report "length" (issue #9716).
finishReason = FinishReasonLength
}

// Final delta chunk: empty delta with finish_reason set. Per
Expand Down Expand Up @@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
}

// If generation hit the max_tokens ceiling, report "length"
// instead of a natural "stop" (issue #9716). Mirrors the
// streaming path; tool/function finish reasons are untouched.
if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
for i := range result {
if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
lengthReason := FinishReasonLength
result[i].FinishReason = &lengthReason
}
}
}

// No MCP tools to execute (or no MCP tools configured), return response
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
Expand Down
3 changes: 3 additions & 0 deletions core/http/endpoints/openai/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ const (
FinishReasonStop = "stop"
FinishReasonToolCalls = "tool_calls"
FinishReasonFunctionCall = "function_call"
// FinishReasonLength is reported when generation stopped because it
// reached the max_tokens budget rather than a natural stop (issue #9716).
FinishReasonLength = "length"
)
22 changes: 20 additions & 2 deletions core/http/endpoints/openai/inference.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ import (
"github.com/mudler/xlog"
)

// reachedTokenBudget tells the caller whether a generation consumed its whole
// max_tokens allowance.
//
// completion is the number of completion tokens the generation produced, and
// maxTokens is the configured ceiling. A nil, zero or negative ceiling is
// treated as "unlimited", so the result is always false in those cases.
// Otherwise the result is true once completion is at or above the ceiling.
//
// Callers use the answer both to skip regeneration retries (a retry would only
// run into the same ceiling) and to report finish_reason "length" rather than
// "stop" (issue #9716).
func reachedTokenBudget(completion int, maxTokens *int) bool {
	// No ceiling configured at all.
	if maxTokens == nil {
		return false
	}

	limit := *maxTokens
	// Non-positive values also mean "no limit".
	if limit <= 0 {
		return false
	}

	return completion >= limit
}

func ComputeChoices(
req *schema.OpenAIRequest,
predInput string,
Expand Down Expand Up @@ -113,11 +121,21 @@ func ComputeChoices(
}
prediction = p

// budgetExhausted is true when the model stopped because it reached
// the configured max_tokens ceiling. None of the retry paths below
// should fire in that case: regenerating would just hit the same
// ceiling again and multiply token consumption (issue #9716). A
// thinking model that spends its whole budget on the reasoning block
// produces an empty content / reasoning-only response, which would
// otherwise look like a failed generation worth retrying. This is a
// "length" finish, not an empty one.
budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)

// Built-in: retry on truly empty response (no tokens at all).
// However, when the C++ autoparser is active, it clears the raw
// message and delivers content via ChatDeltas instead. Do NOT
// retry if ChatDeltas contain tool calls or content.
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
hasChatDeltaData := false
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
Expand Down Expand Up @@ -159,7 +177,7 @@ func ComputeChoices(
}
}
}
if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
// Caller has already reset its state inside shouldRetry
result = result[:0]
allChatDeltas = nil
Expand Down
67 changes: 67 additions & 0 deletions core/http/endpoints/openai/inference_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
})
})

// Direct checks of the reachedTokenBudget predicate, covering three cases:
// no configured limit, completion below the limit, and completion at or
// above the limit.
Context("reachedTokenBudget", func() {
// ptr returns a pointer to its argument so *int limits can be built inline.
ptr := func(i int) *int { return &i }
// A nil, zero or negative limit is expected to yield false even though
// the completion count (1000) is large.
It("is false when no limit is configured", func() {
Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
})
// One token short of the limit must not count as exhausted.
It("is false when generation stopped below the limit", func() {
Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
})
// The boundary is inclusive: exactly at the limit counts, as does
// overshooting it.
It("is true when generation reached or exceeded the limit", func() {
Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
})
})

Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
// Reproduces the streaming retry loop: when a thinking model spends its
// entire max_tokens budget on the reasoning block, the C++ autoparser
// clears the raw Response and delivers reasoning-only ChatDeltas (no
// content, no tool calls). The built-in empty-response retry then fires
// and regenerates from scratch up to maxRetries times, each re-consuming
// the whole budget — instead of terminating with finish_reason "length".
It("should NOT retry when the token budget was exhausted", func() {
// Set a 100-token ceiling on cfg (cfg is declared outside this block).
maxTokens := 100
cfg.Maxtokens = &maxTokens

// calls counts how many times the stubbed prediction function runs.
calls := 0
// Replace the backend inference hook with a stub. None of the
// arguments are inspected; it always hands back the same predFunc.
backend.ModelInferenceFunc = func(
ctx context.Context, s string, messages schema.Messages,
images, videos, audios []string,
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
o *config.ApplicationConfig,
tokenCallback func(string, backend.TokenUsage) bool,
tools, toolChoice string,
logprobs, topLogprobs *int,
logitBias map[string]float64,
metadata map[string]string,
) (func() (backend.LLMResponse, error), error) {
predFunc := func() (backend.LLMResponse, error) {
calls++
// Autoparser cleared Response; only reasoning was produced,
// and the completion count reached the max_tokens budget.
return backend.LLMResponse{
Response: "",
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
Usage: backend.TokenUsage{Prompt: 5, Completion: maxTokens},
}, nil
}
return predFunc, nil
}

// Run ComputeChoices with a callback that appends each produced string
// as a Choice. Only the returned usage and error are examined below.
_, usage, _, err := ComputeChoices(
makeReq(), "test", cfg, nil, appCfg, nil,
func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
},
nil,
)
Expect(err).ToNot(HaveOccurred())
// The model hit its token ceiling; regenerating would just hit it
// again and multiply token consumption. Exactly one call expected.
Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
// The reported completion usage must equal the value the stub returned.
Expect(usage.Completion).To(Equal(maxTokens))
})
})

Context("with streaming token callback", func() {
It("should call tokenCallback for streaming responses", func() {
var streamedTokens []string
Expand Down
Loading