Skip to content

Commit f76f479

Browse files
unamedkr authored and claude committed
fix(chat-cache): second audit pass — 9 more bugs eliminated
Follow-up to PR #52. A fresh code-reading audit found another batch of hidden bugs in the chat KV cache path. None had visible symptoms in the happy path; all were latent failure modes that would surface under load, on long histories, or on memory pressure. Bugs fixed: CRITICAL - B1: tq_generate_continue's sliding-window truncation silently desynced cached_text. cached_text claimed the FULL prompt was committed, but cached_tokens only held the truncated tail — next turn's text-prefix match mapped text bytes to the wrong KV positions. Fix: continue now returns -2 on overflow instead of truncating. - B2: cached_text was updated even when generation returned an error, leaving the cache claiming committed bytes that weren't. - B3: chat_accum_callback realloc failure silently dropped tokens AND skipped the user's stream callback — broken UX + corrupted cached_text. Fix: always pass tokens to user_cb; mark accumulator tainted on realloc failure; skip cached_text update if tainted. - B4: server's get_or_create_session didn't NULL-check tq_create_state_ex. An OOM made the next call dereference a NULL kv_state. - B5: CLI cmd_run interactive loop ignored quant_chat return code, so context overflow produced an infinite stream of empty replies. Fix: catch ChatContextOverflow, drop oldest turn, retry. HIGH - B6: server streaming path only branched on rc == -2; rc == -1 produced HTTP 200 with finish_reason "stop" and no error info. Now sends an error delta + finish_reason "error". - B7: server reused an existing session even when the request changed kv_type / value_quant_bits — old quantized blocks would be misinterpreted. Now detects the change and rebuilds state. - B8: WASM wasm_load_model didn't reset g_generating. After a page-reload mid-stream, every subsequent generate call early-returned -1 forever. - B9: rep_penalty was silently ignored in tq_generate_chat_text's FAST path (slow path applied it). Now mirrors the slow path. 
- B10: Python Model.chat() ignored the C return value; -2 / -1 surfaced as empty token streams. Now raises ChatContextOverflow / RuntimeError. MEDIUM - Removed dead `update_cache:` label. - Synced bindings/python/quant.h (sdist staging snapshot). Verification: - ctest --test-dir build → 35/35 passed - cmake --build build → all targets clean - wasm/build.sh → quant.wasm rebuilt clean (320K) - Python imports + cli --help work quant.h and src/engine/tq_generate.c are kept in lockstep (every chat-cache change applied to both). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 05b7e11 commit f76f479

File tree

8 files changed

+1056
-222
lines changed

8 files changed

+1056
-222
lines changed

bindings/python/quant.h

Lines changed: 672 additions & 151 deletions
Large diffs are not rendered by default.

bindings/python/quantcpp/__init__.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,16 @@
3535
)
3636

3737

38+
class ChatContextOverflow(RuntimeError):
39+
"""Raised when chat history exceeds the model's context window.
40+
41+
The C side has already auto-reset the session by the time this is
42+
raised — the caller must trim its conversation history (drop the
43+
oldest turns) and retry. Catching this is the supported way to
44+
detect "we hit max_seq_len" without parsing log output.
45+
"""
46+
47+
3848
# -----------------------------------------------------------------------
3949
# Model registry — small GGUF models auto-downloaded from HuggingFace
4050
# -----------------------------------------------------------------------
@@ -394,6 +404,15 @@ def chat(self, prompt: str) -> Iterator[str]:
394404
395405
Falls back to ``generate()`` on older library builds without
396406
``quant_chat`` symbol.
407+
408+
Raises
409+
------
410+
ChatContextOverflow
411+
When the conversation history exceeds the model's context
412+
window. The session has been auto-reset; the caller should
413+
trim history and retry.
414+
RuntimeError
415+
On other generation failures (allocation, invalid state).
397416
"""
398417
self._ensure_open()
399418
lib = get_lib()
@@ -414,6 +433,7 @@ def chat(self, prompt: str) -> Iterator[str]:
414433
tokens = []
415434
done = threading.Event()
416435
error_box = [None]
436+
rc_box = [0]
417437

418438
def _on_token(text_ptr, _user_data):
419439
if text_ptr:
@@ -424,7 +444,8 @@ def _on_token(text_ptr, _user_data):
424444
def _run():
425445
try:
426446
with self._lock:
427-
lib.quant_chat(self._ctx, prompt.encode("utf-8"), cb, None)
447+
rc_box[0] = lib.quant_chat(
448+
self._ctx, prompt.encode("utf-8"), cb, None)
428449
except Exception as e:
429450
error_box[0] = e
430451
finally:
@@ -448,6 +469,19 @@ def _run():
448469
if error_box[0] is not None:
449470
raise error_box[0]
450471

472+
# Surface generation failures from the C side. Previously these
473+
# were silently swallowed: -2 (context overflow) and -1 (alloc
474+
# failure) both produced empty token streams that callers could
475+
# not distinguish from "the model decided to say nothing".
476+
rc = rc_box[0]
477+
if rc == -2:
478+
raise ChatContextOverflow(
479+
"conversation history exceeds the model's context window — "
480+
"session has been reset, retry with shorter history"
481+
)
482+
if rc < 0:
483+
raise RuntimeError(f"quant_chat failed with rc={rc}")
484+
451485
def reset_chat(self) -> None:
452486
"""Reset the chat KV cache. Next chat() call starts fresh."""
453487
self._ensure_open()
@@ -528,4 +562,4 @@ def load(path: str, **kwargs) -> Model:
528562
return Model(path, **kwargs)
529563

530564

531-
__all__ = ["Model", "load", "download", "__version__"]
565+
__all__ = ["Model", "load", "download", "ChatContextOverflow", "__version__"]

bindings/python/quantcpp/cli.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,24 +151,63 @@ def cmd_run(args):
151151
print(tok, end="", flush=True)
152152
print()
153153
else:
154+
from quantcpp import ChatContextOverflow
154155
print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
155156
# Multi-turn chat: accumulate history as ChatML so the model sees
156157
# prior turns. m.chat() reuses the KV cache via prefix-match, so
157158
# repeating the history is cheap (O(new tokens), not O(n^2)).
158-
history = ""
159+
# turns is a list of (user_msg, assistant_msg) pairs so we can
160+
# trim from the front when we hit context overflow.
161+
turns = []
162+
def _build_history(extra_user=None):
163+
parts = []
164+
for u, a in turns:
165+
parts.append(f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n")
166+
if extra_user is not None:
167+
parts.append(f"<|im_start|>user\n{extra_user}<|im_end|>\n<|im_start|>assistant\n")
168+
return "".join(parts)
169+
159170
try:
160171
while True:
161172
question = input("\nYou: ")
162173
if not question.strip():
163174
continue
164-
history += f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
165175
print("AI: ", end="", flush=True)
166176
reply_buf = []
167-
for tok in m.chat(history):
168-
print(tok, end="", flush=True)
169-
reply_buf.append(tok)
177+
# Retry loop: on context overflow, drop the oldest turn
178+
# and try again. Without this, the C side resets the KV
179+
# cache but Python's history still has the bloat, so
180+
# every subsequent turn would loop back into overflow.
181+
attempt = 0
182+
while True:
183+
history = _build_history(extra_user=question)
184+
try:
185+
for tok in m.chat(history):
186+
print(tok, end="", flush=True)
187+
reply_buf.append(tok)
188+
break
189+
except ChatContextOverflow:
190+
if not turns:
191+
print("\n[chat] message alone exceeds context window — try a shorter question.",
192+
file=sys.stderr)
193+
reply_buf = [] # nothing was emitted
194+
break
195+
dropped = turns.pop(0)
196+
attempt += 1
197+
print(f"\n[chat] context full \u2014 dropped oldest turn ({len(dropped[0])+len(dropped[1])} chars), retrying...",
198+
file=sys.stderr)
199+
# The session was already reset by the C side;
200+
# retrying with the trimmed history will hit
201+
# the slow path on this turn and the fast path
202+
# again from the next turn onward.
203+
if attempt > 8:
204+
print("[chat] too many overflow retries, giving up on this turn.",
205+
file=sys.stderr)
206+
reply_buf = []
207+
break
170208
print()
171-
history += "".join(reply_buf) + "<|im_end|>\n"
209+
if reply_buf:
210+
turns.append((question, "".join(reply_buf)))
172211
except (KeyboardInterrupt, EOFError):
173212
print("\nBye!", file=sys.stderr)
174213

quant.h

Lines changed: 85 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15695,17 +15695,23 @@ int tq_generate_continue(tq_model_t* model,
1569515695
n_new = 1;
1569615696
}
1569715697

15698-
/* Sliding window: drop oldest prompt tokens if the new prompt would
15699-
* leave no room for max_tokens of generation. Keeps the most recent
15700-
* tokens. Forces full reprefill since the prefix shifted. */
15698+
/* Overflow check: reject prompts that won't fit. The previous
15699+
* behavior was to silently drop oldest tokens via a sliding window,
15700+
* but that desynced any cached_text the higher-level wrapper held
15701+
* (cached_text claimed the full prompt, while cached_tokens only
15702+
* had the truncated tail — next turn's text-prefix match would
15703+
* map text bytes to the wrong KV positions). Returning -2 lets the
15704+
* caller decide (reset chat, show error). */
1570115705
int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
1570215706
int budget = max_prompt - reserve - 32;
1570315707
if (budget < 64) budget = 64;
1570415708
if (n_new > budget) {
15705-
int drop = n_new - budget;
15706-
memmove(new_tokens, new_tokens + drop, (size_t)budget * sizeof(int));
15707-
n_new = budget;
15708-
*n_cached_io = 0;
15709+
free(new_tokens);
15710+
if (getenv("TQ_CHAT_DEBUG")) {
15711+
fprintf(stderr, "[chat] OVERFLOW n_new=%d budget=%d max=%d\n",
15712+
n_new, budget, max_prompt);
15713+
}
15714+
return -2;
1570915715
}
1571015716

1571115717
/* Find longest common prefix with the cached tokens.
@@ -15872,25 +15878,30 @@ typedef struct {
1587215878
char* buf;
1587315879
size_t len;
1587415880
size_t cap;
15881+
int tainted; /* 1 if accumulation ever failed → buf is incomplete */
1587515882
void (*user_cb)(const char*, void*);
1587615883
void* user_data;
1587715884
} chat_accum_t;
1587815885

1587915886
static void chat_accum_callback(const char* tok, void* u) {
1588015887
chat_accum_t* ctx = (chat_accum_t*)u;
1588115888
if (!tok) return;
15889+
/* Always pass through to the user's callback first — losing tokens
15890+
* from the user's stream because of an INTERNAL realloc failure is
15891+
* far worse than a stale cached_text on the next turn. */
15892+
if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
15893+
if (ctx->tainted) return;
1588215894
size_t tlen = strlen(tok);
1588315895
if (ctx->len + tlen + 1 > ctx->cap) {
1588415896
size_t new_cap = (ctx->cap + tlen + 64) * 2;
1588515897
char* nb = (char*)realloc(ctx->buf, new_cap);
15886-
if (!nb) return;
15898+
if (!nb) { ctx->tainted = 1; return; }
1588715899
ctx->buf = nb;
1588815900
ctx->cap = new_cap;
1588915901
}
1589015902
memcpy(ctx->buf + ctx->len, tok, tlen);
1589115903
ctx->len += tlen;
1589215904
ctx->buf[ctx->len] = '\0';
15893-
if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
1589415905
}
1589515906

1589615907
int tq_generate_chat_text(tq_model_t* model,
@@ -15918,7 +15929,7 @@ int tq_generate_chat_text(tq_model_t* model,
1591815929
}
1591915930
}
1592015931

15921-
chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0,
15932+
chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
1592215933
.user_cb = config->on_token,
1592315934
.user_data = config->user_data };
1592415935
void (*orig_cb)(const char*, void*) = config->on_token;
@@ -15982,12 +15993,35 @@ int tq_generate_chat_text(tq_model_t* model,
1598215993
matched_text_len, n_suffix);
1598315994
}
1598415995

15985-
/* Generation loop */
15996+
/* Generation loop — mirrors tq_generate_continue including
15997+
* rep_penalty (which the fast path was silently dropping). */
1598615998
int vocab_size = model->config.vocab_size;
1598715999
int n_cached = *n_cached_io;
1598816000
int pos = n_cached;
1598916001
int prev_token = n_cached > 0 ? cached[n_cached - 1] : 1;
1599016002

16003+
float rep_penalty = config->rep_penalty;
16004+
int rep_window = config->rep_window;
16005+
if (rep_window > 64) rep_window = 64;
16006+
int recent_tokens[64];
16007+
int recent_count = 0;
16008+
for (int i = (n_cached > rep_window ? n_cached - rep_window : 0); i < n_cached; i++) {
16009+
recent_tokens[recent_count % 64] = cached[i];
16010+
recent_count++;
16011+
}
16012+
if (rep_penalty > 1.0f) {
16013+
int window = recent_count < rep_window ? recent_count : rep_window;
16014+
for (int r = 0; r < window; r++) {
16015+
int idx = (recent_count - 1 - r) % 64;
16016+
if (idx < 0) idx += 64;
16017+
int tok = recent_tokens[idx];
16018+
if (tok >= 0 && tok < vocab_size && state->logits) {
16019+
if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
16020+
else state->logits[tok] *= rep_penalty;
16021+
}
16022+
}
16023+
}
16024+
1599116025
uint64_t rng_state = config->rng_seed
1599216026
? (uint64_t)config->rng_seed : (uint64_t)time(NULL);
1599316027
int next_token = tq_sample_topp(state->logits, vocab_size,
@@ -16033,9 +16067,24 @@ int tq_generate_chat_text(tq_model_t* model,
1603316067
pos++;
1603416068
generated++;
1603516069

16070+
if (rep_penalty > 1.0f) {
16071+
int window = recent_count < rep_window ? recent_count : rep_window;
16072+
for (int r = 0; r < window; r++) {
16073+
int idx = (recent_count - 1 - r) % 64;
16074+
if (idx < 0) idx += 64;
16075+
int tok = recent_tokens[idx];
16076+
if (tok >= 0 && tok < vocab_size) {
16077+
if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
16078+
else state->logits[tok] *= rep_penalty;
16079+
}
16080+
}
16081+
}
16082+
1603616083
next_token = tq_sample_topp(state->logits, vocab_size,
1603716084
config->temperature, config->top_p,
1603816085
&rng_state);
16086+
recent_tokens[recent_count % 64] = next_token;
16087+
recent_count++;
1603916088
}
1604016089

1604116090
if (output && output_size > 0) {
@@ -16051,21 +16100,35 @@ int tq_generate_chat_text(tq_model_t* model,
1605116100
output, output_size);
1605216101
}
1605316102

16054-
update_cache:
1605516103
config->on_token = orig_cb;
1605616104
config->user_data = orig_ud;
1605716105

16106+
/* Update cached_text only if we know the KV state corresponds
16107+
* EXACTLY to (prompt + accum.buf):
16108+
* - generated >= 0: generation didn't error out
16109+
* - !accum.tainted: every generated token was captured
16110+
* On any failure, clear cached_text so the next call falls through
16111+
* to the slow path with a clean slate instead of trusting bytes
16112+
* that don't match the KV cache. */
1605816113
if (cached_text_io) {
16059-
size_t plen = strlen(prompt);
16060-
size_t glen = accum.len;
16061-
size_t new_len = plen + glen;
16062-
char* nt = (char*)malloc(new_len + 1);
16063-
if (nt) {
16064-
memcpy(nt, prompt, plen);
16065-
if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
16066-
nt[new_len] = '\0';
16067-
if (*cached_text_io) free(*cached_text_io);
16068-
*cached_text_io = nt;
16114+
if (generated < 0 || accum.tainted) {
16115+
if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
16116+
} else {
16117+
size_t plen = strlen(prompt);
16118+
size_t glen = accum.len;
16119+
size_t new_len = plen + glen;
16120+
char* nt = (char*)malloc(new_len + 1);
16121+
if (nt) {
16122+
memcpy(nt, prompt, plen);
16123+
if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
16124+
nt[new_len] = '\0';
16125+
if (*cached_text_io) free(*cached_text_io);
16126+
*cached_text_io = nt;
16127+
} else {
16128+
/* malloc failed → can't refresh cached_text. Clearing it
16129+
* is safer than leaving the previous (now stale) value. */
16130+
if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
16131+
}
1606916132
}
1607016133
}
1607116134
if (accum.buf) free(accum.buf);

0 commit comments

Comments
 (0)