From f76f4792d3fbe1bcd40664e31b69802e36239cb1 Mon Sep 17 00:00:00 2001
From: quantumaikr <hi@quantumai.kr>
Date: Sun, 12 Apr 2026 10:21:41 +0900
Subject: [PATCH] =?UTF-8?q?fix(chat-cache):=20second=20audit=20pass=20?=
 =?UTF-8?q?=E2=80=94=2010=20more=20bugs=20eliminated?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to PR #52. A fresh code-reading audit found another batch of
hidden bugs in the chat KV cache path. None had visible symptoms in the
happy path; all were latent failure modes that would surface under load,
on long histories, or on memory pressure.

Bugs fixed:

CRITICAL
- B1: tq_generate_continue's sliding-window truncation silently desynced
  cached_text. cached_text claimed the FULL prompt was committed, but
  cached_tokens only held the truncated tail — next turn's text-prefix
  match mapped text bytes to the wrong KV positions. Fix: continue now
  returns -2 on overflow instead of truncating.
- B2: cached_text was updated even when generation returned an error,
  leaving the cache claiming committed bytes that weren't.
- B3: chat_accum_callback realloc failure silently dropped tokens AND
  skipped the user's stream callback — broken UX + corrupted cached_text.
  Fix: always pass tokens to user_cb; mark accumulator tainted on
  realloc failure; skip cached_text update if tainted.
- B4: server's get_or_create_session didn't NULL-check tq_create_state_ex.
  An OOM made the next call dereference a NULL kv_state.
- B5: CLI cmd_run interactive loop ignored quant_chat return code, so
  context overflow produced an infinite stream of empty replies. Fix:
  catch ChatContextOverflow, drop oldest turn, retry.

HIGH
- B6: server streaming path only branched on rc == -2; rc == -1 produced
  HTTP 200 with finish_reason "stop" and no error info. Now sends an
  error delta + finish_reason "error".
- B7: server reused an existing session even when the request changed
  kv_type / value_quant_bits — old quantized blocks would be
  misinterpreted. Now detects the change and rebuilds state.
- B8: WASM wasm_load_model didn't reset g_generating. After a
  page-reload mid-stream, every subsequent generate call early-returned
  -1 forever.
- B9: rep_penalty was silently ignored in tq_generate_chat_text's FAST
  path (slow path applied it). Now mirrors the slow path.
- B10: Python Model.chat() ignored the C return value; -2 / -1 surfaced
  as empty token streams. Now raises ChatContextOverflow / RuntimeError.

MEDIUM
- Removed dead `update_cache:` label.
- Synced bindings/python/quant.h (sdist staging snapshot).

Verification:
- ctest --test-dir build → 35/35 passed
- cmake --build build → all targets clean
- wasm/build.sh → quant.wasm rebuilt clean (320K)
- Python imports + cli --help work

quant.h and src/engine/tq_generate.c are kept in lockstep (every
chat-cache change applied to both).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bindings/python/quant.h              | 823 ++++++++++++++++++++++-----
 bindings/python/quantcpp/__init__.py |  38 +-
 bindings/python/quantcpp/cli.py      |  51 +-
 quant.h                              | 107 +++-
 src/engine/tq_generate.c             | 113 +++-
 src/server/tq_server.c               | 138 ++++-
 wasm/quant.wasm                      | Bin 292926 -> 293858 bytes
 wasm/quant_wasm.c                    |   8 +
 8 files changed, 1056 insertions(+), 222 deletions(-)

diff --git a/bindings/python/quant.h b/bindings/python/quant.h
index 49fea77..36cbbb2 100644
--- a/bindings/python/quant.h
+++ b/bindings/python/quant.h
@@ -62,6 +62,13 @@ int quant_generate(quant_ctx* ctx, const char* prompt,
                    void (*on_token)(const char* text, void* user_data),
                    void* user_data);
 
+// Multi-turn chat with KV cache reuse: each call re-prefills only the suffix
+// that diverges from the cached history (O(delta) per turn instead of O(n^2)).
+// prompt = NULL resets the session. Returns tokens generated; <0 on failure (-1 error, -2 overflow).
+int quant_chat(quant_ctx* ctx, const char* prompt,
+               void (*on_token)(const char* text, void* user_data),
+               void* user_data);
+
 // Generate and return full response as string. Caller must free().
 char* quant_ask(quant_ctx* ctx, const char* prompt);
 
@@ -202,8 +209,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
 // ============================================================================
 
-
-
 /* Cross-language static assert: works in both C11 and C++11/17 */
 #ifdef __cplusplus
 #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +224,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 #define TQ_PI_2 1.5707963267948966f
 #endif
 
-
-
 /* ============================================================
  * Constants
  * ============================================================ */
@@ -398,8 +401,6 @@ typedef struct {
     int      enable_recompression;/* Tier 1 → Tier 2 re-compression   */
 } tq_progressive_config_t;
 
-
-
 /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
  * Block covers TQ_BK elements (128).
@@ -469,12 +470,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
 
-
-
-
-
-
-
 /* Format specification — version-aware, ONNX-inspired */
 
 #define TQ_SPEC_VERSION 1
@@ -500,18 +495,10 @@ typedef struct {
     uint8_t  flags;            /* TQ_FLAG_* bitmask                 */
 } tq_format_spec_t;
 
-
-
-
-
 // ============================================================================
 // Section 2: Engine Types (from tq_engine.h)
 // ============================================================================
 
-
-
-
-
 /* ============================================================
  * Model configuration
  * ============================================================ */
@@ -886,6 +873,7 @@ typedef struct {
     int n_threads;
     float rep_penalty;    /* repetition penalty (default: 1.1, 1.0 = disabled) */
     int rep_window;       /* how many recent tokens to penalize (default: 32) */
+    unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
     /* Callback for streaming output */
     void (*on_token)(const char* text, void* user_data);
     void* user_data;
@@ -1123,9 +1111,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
 /* Max threads supported by thread pool */
 #define TQ_TP_MAX 16
 
-
-
-
 // ============================================================================
 // Section 3: GGUF Types (from tq_gguf.h)
 // ============================================================================
@@ -1143,10 +1128,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
  * directly into TurboQuant inference engine.
  */
 
-
-
-
-
 /* ============================================================
  * GGUF format constants
  * ============================================================ */
@@ -1462,14 +1443,10 @@ int tq_metal_moe_forward(
     const int*      up_types,       /* per-expert up quant types, NULL = use weight_type */
     const int*      down_types);    /* per-expert down quant types, NULL = use weight_type */
 
-
-
-
 // ============================================================================
 // Section 4: Internal API (from turboquant.h)
 // ============================================================================
 
-
 /**
  * TurboQuant.cpp — Cross-platform KV cache compression library
  *
@@ -1477,9 +1454,6 @@ int tq_metal_moe_forward(
  * Zero external dependencies (libc/libm only).
  */
 
-
-
-
 /* ============================================================
  * Version
  * ============================================================ */
@@ -1753,21 +1727,28 @@ void      tq_progressive_free(tq_progressive_t* p);
 
 tq_progressive_config_t tq_progressive_default_config(void);
 
-
-
-
-
 // ============================================================================
 // Section 5: quant_ctx struct definition
 // ============================================================================
 
-
 struct quant_ctx {
     tq_model_t* model;
     tq_state_t* state;
     tq_tokenizer_t* tokenizer;
     tq_gen_config_t config;
-    int n_ctx_tokens;  /* number of tokens currently in KV cache */
+    int n_ctx_tokens;     /* number of tokens currently in KV cache */
+    /* Prefix-match cache for chat history reuse:
+     * stores the actual token IDs that are committed to the KV cache,
+     * so the next quant_generate() can skip the matching prefix and
+     * only prefill the diverging suffix. Critical for chat mode where
+     * each turn re-sends the entire conversation history. */
+    int* cached_tokens;   /* token IDs committed to the KV cache */
+    int  n_cached;        /* NOTE(review): presumably the number of valid entries in cached_tokens */
+    int  cached_capacity; /* NOTE(review): presumably the allocated length of cached_tokens, in ints */
+    /* Text-prefix cache: stores the entire prompt + generated response
+     * text from the last call, allowing the next call to bypass BPE
+     * re-tokenization issues by matching at the byte level. */
+    char* cached_text;    /* prompt + response text of the last call */
 };
 
 // ============================================================================
@@ -1788,7 +1769,6 @@ struct quant_ctx {
  * - Random signs decorrelate channels across different blocks
  */
 
-
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -1902,7 +1882,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
  */
 /* Generic reference — no compiler-specific pragmas */
 
-
 /* ---------- FP16 helpers ---------- */
 
 static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2264,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
 // Section 8: Type Traits (from tq_traits.c)
 // ============================================================================
 
-
 /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
 static void tq_stub_quantize(const float* src, void* dst, int n) {
     (void)src; (void)dst; (void)n;
@@ -2583,7 +2561,6 @@ tq_type tq_type_from_name(const char* name) {
  * No external dependencies — libc/libm only.
  */
 
-
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -2617,7 +2594,6 @@ static struct {
 
 static int g_n_threads = 1;
 
-
 static void* tp_worker(void* arg) {
     int id = (int)(intptr_t)arg;
     int my_gen = 0;
@@ -4173,6 +4149,7 @@ tq_gen_config_t tq_default_gen_config(void) {
     config.n_threads = 1;
     config.rep_penalty = 1.1f;
     config.rep_window = 32;
+    config.rng_seed = 42ULL;
     config.on_token = NULL;
     config.user_data = NULL;
     return config;
@@ -4388,8 +4365,6 @@ void tq_matmul_1bit(float* out, const float* x,
  * SPDX-License-Identifier: MIT
  */
 
-
-
 #ifdef _WIN32
 #else
 #endif
@@ -5098,8 +5073,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
  * Pure C11, no external dependencies.
  */
 
-
-
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #include <arm_neon.h>
 #define TQ_HAS_NEON 1
@@ -7174,7 +7147,6 @@ void tq_metal_batch_end_if_available(void) {
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
  */
 
-
 /* Global for qsort comparator (vocab index sorting) */
 static char** g_vocab_for_sort;
 static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8033,32 +8005,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
         }
     }
 
-    /* Load merges if available */
+    /* Build sorted indices BEFORE merge parsing so str_lookup() can use
+     * binary search instead of O(n) linear scan.  For 248K vocab with
+     * ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
+    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
+    if (tok->sorted_indices) {
+        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
+        g_vocab_for_sort = tok->vocab;
+        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
+    }
+
+    /* Load and parse merges if available.
+     * GGUF stores merges as a string array of "tok_a tok_b" pairs.
+     * We need to look up token IDs and build (id_a, id_b, id_merged) triples
+     * so the BPE encoder can use them. */
     int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
     if (merges_idx >= 0) {
         const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
         if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
             mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
-            /* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
-            uint64_t n_merges = mkv->value.array.count;
-            tok->n_merges = (int)n_merges;
-            tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
+            uint64_t n_merges_total = mkv->value.array.count;
+            tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
+            tok->n_merges = 0;
             if (tok->merge_pairs) {
-                memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
+                tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
+                for (uint64_t mi = 0; mi < n_merges_total; mi++) {
+                    if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
+
+                    /* Copy merge string and split on space: "tok_a tok_b" */
+                    char buf[2048];
+                    int slen = (int)merge_strings[mi].len;
+                    if (slen >= (int)sizeof(buf)) continue;
+                    memcpy(buf, merge_strings[mi].str, (size_t)slen);
+                    buf[slen] = '\0';
+
+                    char* sep = strchr(buf, ' ');
+                    if (!sep) continue;
+                    *sep = '\0';
+                    const char* str_a = buf;
+                    const char* str_b = sep + 1;
+
+                    /* Build merged string: concatenation of tok_a + tok_b */
+                    char merged[2048];
+                    int la = (int)strlen(str_a);
+                    int lb = (int)strlen(str_b);
+                    if (la + lb >= (int)sizeof(merged)) continue;
+                    memcpy(merged, str_a, (size_t)la);
+                    memcpy(merged + la, str_b, (size_t)lb);
+                    merged[la + lb] = '\0';
+
+                    /* Look up token IDs via binary search (sorted_indices built above) */
+                    int id_a = str_lookup(tok, str_a);
+                    int id_b = str_lookup(tok, str_b);
+                    int id_merged = str_lookup(tok, merged);
+
+                    if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
+                        tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
+                        tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
+                        tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
+                        /* Priority: earlier merges in GGUF = higher priority */
+                        tok->scores[id_merged] = (float)(n_merges_total - mi);
+                        tok->n_merges++;
+                    }
+                }
+                fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
+                        tok->n_merges, (int)n_merges_total);
             }
         }
     }
 
-    /* Build sorted indices for encoding (binary search by string).
-     * Use qsort for O(n log n) instead of insertion sort O(n²) — critical
-     * for 248K vocab where insertion sort would take minutes. */
-    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
-    if (tok->sorted_indices) {
-        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
-        g_vocab_for_sort = tok->vocab;
-        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
-    }
-
     fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
             tok->vocab_size, tok->max_token_len);
     return tok;
@@ -8476,7 +8491,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
  */
 
-
 #ifdef _WIN32
 #else
 #endif
@@ -9939,18 +9953,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
 
     free(tensors);
 
-    /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
-     * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
-     * We bake the "+1" into the weight so tq_rmsnorm can stay as
-     * out = x * rsqrt * weight.
-     *
-     * This applies to: input_layernorm, post_attention_layernorm,
-     * model.norm, q_norm, k_norm.
-     * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
-     * uses plain weight without +1).
-     *
-     * We detect Qwen3.5 by the presence of DeltaNet layers. */
-    if (model->config.delta_n_heads > 0) {
+    /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
+     * Only for non-GGUF models (raw checkpoints). GGUF files from
+     * llama.cpp already have +1 baked in by the converter.
+     * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
+    if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
 
@@ -9979,7 +9986,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
             for (int i = 0; i < dim_h; i++)
                 model->output_norm[i] += 1.0f;
         }
-        fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
+        fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
     }
 
     /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
@@ -12143,8 +12150,13 @@ tq_model_t* tq_load_gguf(const char* path) {
         }
 
         const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
-        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
+        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
+         * Can be set via environment variable or compile-time define (useful for WASM). */
+#ifdef TQ_NO_Q4
+        if (1) {
+#else
         if (getenv("TQ_NO_Q4")) {
+#endif
             fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
             goto skip_q4_conversion;
         }
@@ -12893,7 +12905,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
  *   -> residual add
  */
 
-
 /* Unified Q2/1-bit matmul dispatch.
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15153,7 +15164,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         }
     }
 
-
     /* Increment profile token count if profiling is active */
     if (s->profile_kv) {
         s->profile_kv_count++;
@@ -15204,7 +15214,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
  *   - Full generation loop with streaming callback
  */
 
-
 /* ============================================================
  * Argmax sampling: return token with highest logit
  * ============================================================ */
@@ -15425,7 +15434,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         fprintf(stderr, "\n");
     }
 
-    /* Prefill: process all prompt tokens */
+    /* Prefill: process all prompt tokens.
+     * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
+     * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
+     * sleep here breaks ASYNCIFY for the entire generate call, including
+     * the token streaming callback. The browser shows "Thinking..." via
+     * requestAnimationFrame before entering this blocking prefill. */
     for (int i = 0; i < n_prompt; i++) {
         tq_forward(model, state, prompt_tokens[i], i);
     }
@@ -15460,9 +15474,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         }
     }
 
-    /* Sample first generated token */
+    /* Sample first generated token. The seed is configurable via
+     * config->rng_seed (default 42); 0 falls back to 42 so existing
+     * callers that never set rng_seed get bit-identical behaviour. */
     int pos = n_prompt;
-    unsigned long long rng_state = 42;
+    unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
     int next_token = tq_sample_topp(state->logits, vocab_size,
                                      config->temperature, config->top_p,
                                      &rng_state);
@@ -15627,6 +15643,498 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     return generated;
 }
 
+/* ============================================================================
+ * tq_generate_continue — reuse an existing tq_state_t across calls.
+ *
+ * Unlike tq_generate (which allocates and frees its own state on every call),
+ * this function takes a caller-managed state plus a record of which tokens
+ * are currently committed to the KV cache. It computes the longest common
+ * prefix between the cached tokens and the new prompt, then prefills only
+ * the diverging suffix. After generation, *cached_tokens_out and
+ * *n_cached_out are updated to reflect the new cache contents.
+ *
+ * This turns chat mode from O(n^2) (full re-prefill every turn) into
+ * O(delta) (only the new tokens of each turn).
+ *
+ * Returns the number of tokens generated, -1 on error, or -2 when the prompt exceeds the context budget.
+ * ============================================================================ */
+static int tq_lcp_int(const int* a, int na, const int* b, int nb) {
+    int lim = na < nb ? na : nb;            /* never read past the shorter array */
+    int i = 0;
+    while (i < lim && a[i] == b[i]) i++;    /* stop at the first mismatch */
+    return i;   /* length of the longest common prefix of a[0..na) and b[0..nb) */
+}
+
+int tq_generate_continue(tq_model_t* model,
+                          tq_tokenizer_t* tokenizer,
+                          tq_state_t* state,
+                          const char* prompt,
+                          tq_gen_config_t* config,
+                          int** cached_tokens_io,   /* in/out: cached prefix tokens */
+                          int*  n_cached_io,        /* in/out: cached count */
+                          int*  cached_capacity_io, /* in/out: allocated capacity */
+                          char* output, int output_size) {
+    if (!model || !state || !config || !cached_tokens_io || !n_cached_io || !cached_capacity_io) {
+        return -1;
+    }
+
+    /* Heap-allocated prompt token buffer (was a 4096-stack array, which
+     * silently truncated after ~10 turns of accumulating chat history).
+     * Cap at the model's max_seq_len so we never exceed KV bounds. */
+    int max_prompt = model->config.max_seq_len > 0
+                       ? model->config.max_seq_len : 4096;
+    int* new_tokens = (int*)malloc((size_t)max_prompt * sizeof(int));
+    if (!new_tokens) return -1;
+    int n_new = 0;
+    if (tokenizer && prompt) {
+        int add_bos = (model->config.model_type == 1) ? 1 : 0;
+        n_new = tq_encode(tokenizer, prompt, new_tokens, max_prompt, add_bos);
+    }
+    if (n_new <= 0) {
+        new_tokens[0] = (model->config.model_type == 1) ? 2 : 1;
+        n_new = 1;
+    }
+
+    /* Overflow check: reject prompts that won't fit. The previous
+     * behavior was to silently drop oldest tokens via a sliding window,
+     * but that desynced any cached_text the higher-level wrapper held
+     * (cached_text claimed the full prompt, while cached_tokens only
+     * had the truncated tail — next turn's text-prefix match would
+     * map text bytes to the wrong KV positions). Returning -2 lets the
+     * caller decide (reset chat, show error). */
+    int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
+    int budget  = max_prompt - reserve - 32;
+    if (budget < 64) budget = 64;
+    if (n_new > budget) {
+        free(new_tokens);
+        if (getenv("TQ_CHAT_DEBUG")) {
+            fprintf(stderr, "[chat] OVERFLOW n_new=%d budget=%d max=%d\n",
+                    n_new, budget, max_prompt);
+        }
+        return -2;
+    }
+
+    int n_cached = *n_cached_io;
+    int* cached_tokens = *cached_tokens_io;
+    /* Grow the record BEFORE touching KV so a failed realloc leaves both in
+     * sync; size it with `reserve` so the prompt fits even if max_tokens <= 0. */
+    int needed_cap = n_new + reserve + 16;
+    if (*cached_capacity_io < needed_cap) {
+        int new_cap = needed_cap < 4096 ? 4096 : needed_cap;
+        int* nb = (int*)realloc(*cached_tokens_io, (size_t)new_cap * sizeof(int));
+        if (!nb) { free(new_tokens); return -1; }
+        *cached_tokens_io = nb;
+        *cached_capacity_io = new_cap;
+        cached_tokens = nb;
+    }
+
+    /* Longest common prefix with the cached tokens: only [lcp, n_new) is
+     * prefilled. If the cache already covers the whole prompt, re-run the
+     * last prompt token so state->logits belongs to position n_new - 1. */
+    int lcp = tq_lcp_int(cached_tokens, n_cached, new_tokens, n_new);
+    if (lcp >= n_new) lcp = n_new - 1;
+    for (int i = lcp; i < n_new; i++) {
+        tq_forward(model, state, new_tokens[i], i);
+    }
+    int pos = n_new;
+    int prefill_tokens = n_new - lcp;
+    int prefix_hit    = lcp;
+
+    /* Record the prompt; generated tokens are appended in the loop below. */
+    memcpy(cached_tokens, new_tokens, (size_t)n_new * sizeof(int));
+    *n_cached_io = n_new;
+    n_cached = n_new;
+
+    /* --- generation loop (mirrors tq_generate's loop) --- */
+    int vocab_size = model->config.vocab_size;
+    float rep_penalty = config->rep_penalty;
+    int rep_window = config->rep_window;
+    if (rep_window > 64) rep_window = 64;
+    int recent_tokens[64];
+    int recent_count = 0;
+    for (int i = (n_new > rep_window ? n_new - rep_window : 0); i < n_new; i++) {
+        recent_tokens[recent_count % 64] = new_tokens[i];
+        recent_count++;
+    }
+
+    if (rep_penalty > 1.0f) {
+        int window = recent_count < rep_window ? recent_count : rep_window;
+        for (int r = 0; r < window; r++) {
+            int idx = (recent_count - 1 - r) % 64;
+            if (idx < 0) idx += 64;
+            int tok = recent_tokens[idx];
+            if (tok >= 0 && tok < vocab_size && state->logits) {
+                if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                else                         state->logits[tok] *= rep_penalty;
+            }
+        }
+    }
+
+    /* Seed 0 falls back to 42, matching tq_generate (same type and default). */
+    unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
+    int next_token = tq_sample_topp(state->logits, vocab_size,
+                                     config->temperature, config->top_p,
+                                     &rng_state);
+
+    int generated = 0;
+    int output_pos = 0;
+    int prev_token = new_tokens[n_new - 1];
+
+    int eos_tokens[] = {
+        1, 2, 106, 128001, 128006, 128007, 128008, 128009, 248044, 248046,
+    };
+    int n_eos = sizeof(eos_tokens) / sizeof(eos_tokens[0]);
+
+    while (generated < config->max_tokens) {
+        int is_eos = 0;
+        for (int e = 0; e < n_eos; e++) {
+            if (next_token == eos_tokens[e]) { is_eos = 1; break; }
+        }
+        if (is_eos) break;
+
+        if (pos >= model->config.max_seq_len) break;  /* simple stop, no shift */
+
+        /* Decode + stream */
+        if (tokenizer) {
+            const char* piece = tq_decode(tokenizer, prev_token, next_token);
+            int should_stop = 0;
+            if (piece) {
+                if (strstr(piece, "<|im_end|>") || strstr(piece, "<|eot_id|>") ||
+                    strstr(piece, "<|start_header_id|>")) {
+                    should_stop = 1; piece = "";
+                }
+            }
+            if (should_stop) break;
+            int piece_len = (int)strlen(piece ? piece : "");
+            if (config->on_token && piece) config->on_token(piece, config->user_data);
+            if (output && piece && output_pos + piece_len < output_size - 1) {
+                memcpy(output + output_pos, piece, piece_len);
+                output_pos += piece_len;
+            }
+        }
+
+        /* Append generated token to cache record */
+        if (n_cached < *cached_capacity_io) {
+            cached_tokens[n_cached++] = next_token;
+            *n_cached_io = n_cached;
+        }
+
+        prev_token = next_token;
+        tq_forward(model, state, next_token, pos);
+        pos++;
+        generated++;
+
+        if (rep_penalty > 1.0f) {
+            int window = recent_count < rep_window ? recent_count : rep_window;
+            for (int r = 0; r < window; r++) {
+                int idx = (recent_count - 1 - r) % 64;
+                if (idx < 0) idx += 64;
+                int tok = recent_tokens[idx];
+                if (tok >= 0 && tok < vocab_size) {
+                    if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                    else                         state->logits[tok] *= rep_penalty;
+                }
+            }
+        }
+
+        next_token = tq_sample_topp(state->logits, vocab_size,
+                                     config->temperature, config->top_p,
+                                     &rng_state);
+        recent_tokens[recent_count % 64] = next_token;
+        recent_count++;
+    }
+
+    if (output && output_size > 0) {
+        output[output_pos < output_size ? output_pos : output_size - 1] = '\0';
+    }
+
+    if (getenv("TQ_CHAT_DEBUG")) {
+        fprintf(stderr,
+            "[chat] prefix_hit=%d prefill=%d generated=%d cached=%d\n",
+            prefix_hit, prefill_tokens, generated, *n_cached_io);
+    }
+
+    free(new_tokens);
+    return generated;
+}
+
+/* ============================================================================
+ * tq_generate_chat_text — text-prefix matching for chat reuse
+ *
+ * Solves the BPE re-tokenization issue: when the model generates response
+ * tokens via sample_topp, those token IDs may not match what tq_encode()
+ * produces from the same response text in the next turn's prompt. The
+ * token-level LCP in tq_generate_continue truncates at that boundary.
+ *
+ * This function tracks the *text* of the last prompt+response. On the next
+ * call, if the new prompt starts with cached_text byte-for-byte, the entire
+ * cached state is valid — tokenize ONLY the new SUFFIX text and prefill
+ * those tokens at positions [n_cached..]. No LCP, no truncation.
+ *
+ * Pass cached_text_io == NULL to disable text-prefix tracking.
+ * ============================================================================ */
+
+typedef struct {
+    char*  buf;       /* accumulated token text, NUL-terminated after every append */
+    size_t len;       /* bytes stored in buf, not counting the terminator */
+    size_t cap;       /* allocated size of buf in bytes */
+    int    tainted;   /* 1 if accumulation ever failed → buf is incomplete */
+    void (*user_cb)(const char*, void*);  /* callback each token is forwarded to; may be NULL */
+    void*  user_data; /* passed unchanged as user_cb's second argument */
+} chat_accum_t;
+
+static void chat_accum_callback(const char* tok, void* u) {  /* forward tok to user_cb, then append it to buf */
+    chat_accum_t* ctx = (chat_accum_t*)u;
+    if (!tok) return;
+    /* Always pass through to the user's callback first — losing tokens
+     * from the user's stream because of an INTERNAL realloc failure is
+     * far worse than a stale cached_text on the next turn. */
+    if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
+    if (ctx->tainted) return;               /* buf is already incomplete: forward only */
+    size_t tlen = strlen(tok);
+    if (ctx->len + tlen + 1 > ctx->cap) {   /* +1 keeps room for the NUL terminator */
+        size_t new_cap = (ctx->cap + tlen + 64) * 2;
+        char* nb = (char*)realloc(ctx->buf, new_cap);
+        if (!nb) { ctx->tainted = 1; return; }  /* ctx->buf is left untouched; this token is not appended */
+        ctx->buf = nb;
+        ctx->cap = new_cap;
+    }
+    memcpy(ctx->buf + ctx->len, tok, tlen);
+    ctx->len += tlen;
+    ctx->buf[ctx->len] = '\0';
+}
+
+int tq_generate_chat_text(tq_model_t* model,
+                           tq_tokenizer_t* tokenizer,
+                           tq_state_t* state,
+                           const char* prompt,
+                           tq_gen_config_t* config,
+                           char** cached_text_io,      /* in/out, may be NULL: text of the last prompt+response */
+                           int** cached_tokens_io,     /* in/out: token ids recorded alongside the KV state */
+                           int*  n_cached_io,          /* in/out: number of valid entries in *cached_tokens_io */
+                           int*  cached_capacity_io,   /* in/out: allocated entries in *cached_tokens_io */
+                           char* output, int output_size) {
+    if (!model || !state || !config || !cached_tokens_io || !n_cached_io || !cached_capacity_io || !prompt) {
+        return -1;   /* a required argument is missing */
+    }
+
+    int matched_text_len = 0;   /* bytes of prompt covered by *cached_text_io; 0 = no match */
+    int prefix_pos = 0;         /* position at which the unmatched suffix is prefilled */
+
+    if (cached_text_io && *cached_text_io && *n_cached_io > 0) {
+        size_t cached_len = strlen(*cached_text_io);
+        if (cached_len > 0 && strncmp(*cached_text_io, prompt, cached_len) == 0) {
+            matched_text_len = (int)cached_len;
+            prefix_pos = *n_cached_io;
+        }
+    }
+
+    chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
+                            .user_cb = config->on_token,
+                            .user_data = config->user_data };
+    void (*orig_cb)(const char*, void*) = config->on_token;
+    void*  orig_ud = config->user_data;
+    config->on_token = chat_accum_callback;   /* intercept pieces so the reply text can be recorded */
+    config->user_data = &accum;
+
+    int generated = 0;
+
+    if (matched_text_len > 0) {   /* fast path: prompt starts with the cached text */
+        const char* suffix = prompt + matched_text_len;
+        int max_prompt = model->config.max_seq_len > 0
+                           ? model->config.max_seq_len : 4096;
+        int* suffix_toks = (int*)malloc((size_t)max_prompt * sizeof(int));
+        if (!suffix_toks) {
+            config->on_token = orig_cb; config->user_data = orig_ud;
+            return -1;
+        }
+        int n_suffix = 0;
+        if (*suffix != '\0') {
+            n_suffix = tq_encode(tokenizer, suffix, suffix_toks, max_prompt, 0);
+            if (n_suffix < 0) n_suffix = 0;   /* a negative result is treated as "no new tokens" */
+        }
+
+        /* Context overflow: return -2 instead of falling back to a
+         * dangerous full reprefill. The state still has stale KV at
+         * positions [n_new..prefix_pos) that would corrupt later tokens.
+         * Caller should reset the chat and retry. */
+        int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
+        if (prefix_pos + n_suffix + reserve + 32 > max_prompt) {
+            free(suffix_toks);
+            config->on_token = orig_cb; config->user_data = orig_ud;
+            if (accum.buf) free(accum.buf);
+            if (getenv("TQ_CHAT_DEBUG")) {
+                fprintf(stderr,
+                    "[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n",
+                    prefix_pos, n_suffix, reserve, max_prompt);
+            }
+            return -2;
+        }
+
+        int needed = prefix_pos + n_suffix + reserve + 16;   /* prefix + suffix + room for generated ids */
+        if (*cached_capacity_io < needed) {
+            int new_cap = needed < 4096 ? 4096 : needed;      /* grow to at least 4096 entries */
+            int* nb = (int*)realloc(*cached_tokens_io, (size_t)new_cap * sizeof(int));
+            if (!nb) { free(suffix_toks); config->on_token = orig_cb; config->user_data = orig_ud; return -1; }
+            *cached_tokens_io = nb;
+            *cached_capacity_io = new_cap;
+        }
+
+        int* cached = *cached_tokens_io;
+        for (int i = 0; i < n_suffix; i++) {   /* prefill: record and forward each suffix token */
+            cached[prefix_pos + i] = suffix_toks[i];
+            tq_forward(model, state, suffix_toks[i], prefix_pos + i);
+        }
+        *n_cached_io = prefix_pos + n_suffix;
+        free(suffix_toks);
+
+        if (getenv("TQ_CHAT_DEBUG")) {
+            fprintf(stderr, "[chat-text] FAST text_match=%d new_suffix_tokens=%d\n",
+                    matched_text_len, n_suffix);
+        }
+
+        /* Generation loop — mirrors tq_generate_continue including
+         * rep_penalty (which the fast path was silently dropping). */
+        int vocab_size = model->config.vocab_size;
+        int n_cached = *n_cached_io;
+        int pos = n_cached;
+        int prev_token = n_cached > 0 ? cached[n_cached - 1] : 1;
+
+        float rep_penalty = config->rep_penalty;
+        int rep_window = config->rep_window;
+        if (rep_window > 64) rep_window = 64;   /* recent_tokens has 64 slots */
+        int recent_tokens[64];
+        int recent_count = 0;
+        for (int i = (n_cached > rep_window ? n_cached - rep_window : 0); i < n_cached; i++) {
+            recent_tokens[recent_count % 64] = cached[i];
+            recent_count++;
+        }
+        if (rep_penalty > 1.0f) {
+            int window = recent_count < rep_window ? recent_count : rep_window;
+            for (int r = 0; r < window; r++) {
+                int idx = (recent_count - 1 - r) % 64;
+                if (idx < 0) idx += 64;
+                int tok = recent_tokens[idx];
+                if (tok >= 0 && tok < vocab_size && state->logits) {
+                    if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                    else                         state->logits[tok] *= rep_penalty;
+                }
+            }
+        }
+
+        uint64_t rng_state = config->rng_seed
+            ? (uint64_t)config->rng_seed : (uint64_t)time(NULL);
+        int next_token = tq_sample_topp(state->logits, vocab_size,
+                                         config->temperature, config->top_p,
+                                         &rng_state);
+
+        int output_pos = 0;
+        int eos_tokens[] = { 1, 2, 106, 128001, 128006, 128007, 128008, 128009, 248044, 248046 };
+        int n_eos = sizeof(eos_tokens) / sizeof(eos_tokens[0]);
+
+        while (generated < config->max_tokens) {
+            int is_eos = 0;
+            for (int e = 0; e < n_eos; e++) {
+                if (next_token == eos_tokens[e]) { is_eos = 1; break; }
+            }
+            if (is_eos) break;
+            if (pos >= model->config.max_seq_len) break;
+
+            const char* piece = tokenizer ? tq_decode(tokenizer, prev_token, next_token) : "";
+            int should_stop = 0;
+            if (piece) {
+                if (strstr(piece, "<|im_end|>") || strstr(piece, "<|eot_id|>") ||
+                    strstr(piece, "<|start_header_id|>")) {
+                    should_stop = 1; piece = "";
+                }
+            }
+            if (should_stop) break;   /* stop marker: the piece is neither emitted nor recorded */
+
+            int piece_len = (int)strlen(piece ? piece : "");
+            if (config->on_token && piece) config->on_token(piece, config->user_data);
+            if (output && piece && output_pos + piece_len < output_size - 1) {
+                memcpy(output + output_pos, piece, piece_len);
+                output_pos += piece_len;
+            }
+
+            if (n_cached < *cached_capacity_io) {
+                cached[n_cached++] = next_token;
+                *n_cached_io = n_cached;
+            }
+
+            prev_token = next_token;
+            tq_forward(model, state, next_token, pos);
+            pos++;
+            generated++;
+
+            if (rep_penalty > 1.0f) {
+                int window = recent_count < rep_window ? recent_count : rep_window;
+                for (int r = 0; r < window; r++) {
+                    int idx = (recent_count - 1 - r) % 64;
+                    if (idx < 0) idx += 64;
+                    int tok = recent_tokens[idx];
+                    if (tok >= 0 && tok < vocab_size) {
+                        if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                        else                         state->logits[tok] *= rep_penalty;
+                    }
+                }
+            }
+
+            next_token = tq_sample_topp(state->logits, vocab_size,
+                                         config->temperature, config->top_p,
+                                         &rng_state);
+            recent_tokens[recent_count % 64] = next_token;
+            recent_count++;
+        }
+
+        if (output && output_size > 0) {
+            output[output_pos < output_size ? output_pos : output_size - 1] = '\0';
+        }
+    } else {   /* slow path: hand the whole prompt to tq_generate_continue */
+        if (getenv("TQ_CHAT_DEBUG")) {
+            fprintf(stderr, "[chat-text] SLOW no text-prefix match, full tokenize\n");
+        }
+        generated = tq_generate_continue(
+            model, tokenizer, state, prompt, config,
+            cached_tokens_io, n_cached_io, cached_capacity_io,
+            output, output_size);
+    }
+
+    config->on_token = orig_cb;   /* restore the caller's callback before returning */
+    config->user_data = orig_ud;
+
+    /* Update cached_text only if we know the KV state corresponds
+     * EXACTLY to (prompt + accum.buf):
+     *   - generated >= 0: generation didn't error out
+     *   - !accum.tainted: every generated token was captured
+     * On any failure, clear cached_text so the next call falls through
+     * to the slow path with a clean slate instead of trusting bytes
+     * that don't match the KV cache. */
+    if (cached_text_io) {
+        if (generated < 0 || accum.tainted) {
+            if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+        } else {
+            size_t plen = strlen(prompt);
+            size_t glen = accum.len;
+            size_t new_len = plen + glen;
+            char* nt = (char*)malloc(new_len + 1);
+            if (nt) {
+                memcpy(nt, prompt, plen);
+                if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
+                nt[new_len] = '\0';
+                if (*cached_text_io) free(*cached_text_io);
+                *cached_text_io = nt;
+            } else {
+                /* malloc failed → can't refresh cached_text. Clearing it
+                 * is safer than leaving the previous (now stale) value. */
+                if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+            }
+        }
+    }
+    if (accum.buf) free(accum.buf);
+
+    return generated;   /* fast path: tokens generated; slow path: tq_generate_continue's result */
+}
 
 // ============================================================================
 
@@ -15862,22 +16370,7 @@ void quant_free_string(char* str) {
     if (str) free(str);
 }
 
-/* ================================================================
- * Context persistence — save/load KV cache to disk
- *
- * File format (binary, little-endian):
- *   magic:     4 bytes "QKVC"
- *   version:   uint32 (1)
- *   n_layers:  uint32
- *   kv_dim:    uint32 (n_kv_heads * head_dim)
- *   max_seq:   uint32
- *   n_tokens:  uint32 (number of filled positions)
- *   kv_type:   uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
- *   has_fp16v: uint32 (1 if value_cache_fp16 is used)
- *   reserved:  32 bytes (future use)
- *   data:      raw KV cache bytes
- * ================================================================ */
-
+/* Context persistence: QKVC format (64-byte header + raw KV data) */
 int quant_save_context(quant_ctx* ctx, const char* path) {
     if (!ctx || !ctx->state || !path) return -1;
     FILE* fp = fopen(path, "wb");
@@ -15886,29 +16379,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-
-    /* Header */
     fwrite("QKVC", 1, 4, fp);
-    uint32_t version = 1;
-    uint32_t nl = (uint32_t)c->n_layers;
-    uint32_t kd = (uint32_t)kv_dim;
-    uint32_t ms = (uint32_t)c->max_seq_len;
-    uint32_t nt = (uint32_t)ctx->n_ctx_tokens;
-    uint32_t kt = (uint32_t)s->kv_quant_type;
-    uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
-    fwrite(&version, 4, 1, fp);
-    fwrite(&nl, 4, 1, fp);
-    fwrite(&kd, 4, 1, fp);
-    fwrite(&ms, 4, 1, fp);
-    fwrite(&nt, 4, 1, fp);
-    fwrite(&kt, 4, 1, fp);
-    fwrite(&hfp16, 4, 1, fp);
-    char reserved[32] = {0};
-    fwrite(reserved, 1, 32, fp);
+    uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
+        (uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
+        (uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
+    fwrite(hdr, 4, 7, fp);
+    char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
 
     /* KV data: write only the filled portion (nt tokens) */
     for (uint32_t l = 0; l < nl; l++) {
-        size_t layer_stride = (size_t)ms * kv_dim;
+        size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
         /* Key cache: FP32 or quantized */
         if (s->key_cache) {
             fwrite(s->key_cache + l * layer_stride, sizeof(float),
@@ -15916,7 +16397,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
         }
         if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
             size_t blk_sz = tq_type_type_size(kt);
-            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)ms * blk_sz;
+            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
             fwrite(qbase, blk_sz, nt, fp);
         }
         /* Value cache: FP32 or FP16 */
@@ -15925,7 +16406,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
                    (size_t)nt * kv_dim, fp);
         }
         if (s->value_cache_fp16) {
-            size_t layer_stride16 = (size_t)ms * kv_dim;
+            size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
             fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
                    (size_t)nt * kv_dim, fp);
         }
@@ -15942,37 +16423,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
     FILE* fp = fopen(path, "rb");
     if (!fp) return -1;
 
-    /* Read and validate header */
     char magic[4];
-    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
-        fclose(fp); return -1;
-    }
-    uint32_t version, nl, kd, ms, nt, kt, hfp16;
-    fread(&version, 4, 1, fp);
-    fread(&nl, 4, 1, fp);
-    fread(&kd, 4, 1, fp);
-    fread(&ms, 4, 1, fp);
-    fread(&nt, 4, 1, fp);
-    fread(&kt, 4, 1, fp);
-    fread(&hfp16, 4, 1, fp);
-    char reserved[32];
-    fread(reserved, 1, 32, fp);
-
+    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
+    uint32_t hdr[7]; fread(hdr, 4, 7, fp);
+    char reserved[32]; fread(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-
-    /* Validate compatibility */
-    if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
-        fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
-                nl, c->n_layers, kd, kv_dim);
-        fclose(fp); return -1;
-    }
-    if (nt > (uint32_t)c->max_seq_len) {
-        fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
-                nt, c->max_seq_len);
-        fclose(fp); return -1;
-    }
+    if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
+    if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
 
     /* Read KV data */
     for (uint32_t l = 0; l < nl; l++) {
@@ -16009,9 +16469,70 @@ void quant_free_ctx(quant_ctx* ctx) {
     if (!ctx) return;
     tq_free_state(ctx->state);
     tq_free_tokenizer(ctx->tokenizer);
+    if (ctx->cached_tokens) free(ctx->cached_tokens);
+    if (ctx->cached_text) free(ctx->cached_text);
     free(ctx);
 }
 
+/* ----------------------------------------------------------------------
+ * quant_chat — chat-mode generate that reuses the KV cache across calls.
+ *
+ * Unlike quant_generate (which resets the state on every call and so makes
+ * each turn O(history_length)), quant_chat keeps the state alive between
+ * calls. Generation is delegated to tq_generate_chat_text: when the new
+ * prompt starts with the text cached by the previous call, only the new
+ * suffix is tokenized and prefilled, so a turn's prefill cost is O(new
+ * tokens this turn); otherwise tq_generate_continue handles the prompt.
+ *
+ * Reset behavior: pass NULL prompt to wipe the cache (start a new chat);
+ * the reset call itself returns 0.
+ *
+ * Returns the number of tokens generated (>= 0), -1 on error, or -2 when
+ * the prompt plus the generation reserve exceeds the context window.
+ * ---------------------------------------------------------------------- */
+int quant_chat(quant_ctx* ctx, const char* prompt,
+               void (*on_token)(const char* text, void* user_data),
+               void* user_data) {
+    if (!ctx || !ctx->model) return -1;
+
+    /* NULL prompt = reset the chat (clear cache + state) */
+    if (!prompt) {
+        tq_free_state(ctx->state);
+        ctx->state = tq_create_state_ex(&ctx->model->config,
+                                         ctx->config.kv_type,
+                                         ctx->config.value_quant_bits);
+        if (ctx->cached_tokens) free(ctx->cached_tokens);
+        ctx->cached_tokens = NULL;
+        ctx->n_cached = 0;
+        ctx->cached_capacity = 0;
+        ctx->n_ctx_tokens = 0;
+        if (ctx->cached_text) { free(ctx->cached_text); ctx->cached_text = NULL; }
+        return 0;
+    }
+
+    if (!ctx->state) {
+        ctx->state = tq_create_state_ex(&ctx->model->config,
+                                         ctx->config.kv_type,
+                                         ctx->config.value_quant_bits);
+        if (!ctx->state) return -1;
+    }
+
+    ctx->config.on_token = on_token;
+    ctx->config.user_data = user_data;
+
+    char output[65536];
+    /* Use the text-prefix path so chat replays bypass BPE re-tokenization
+     * issues. Falls back to token-LCP path if text prefix doesn't match. */
+    int n = tq_generate_chat_text(
+        ctx->model, ctx->tokenizer, ctx->state, prompt, &ctx->config,
+        &ctx->cached_text,
+        &ctx->cached_tokens, &ctx->n_cached, &ctx->cached_capacity,
+        output, sizeof(output));
+    /* A turn that generates zero tokens can still have prefilled prompt
+     * tokens, so refresh the filled-position count on any success. */
+    if (n >= 0) ctx->n_ctx_tokens = ctx->n_cached;
+    return n;
+}
+
 void quant_free_model(quant_model* model) {
     tq_free_model((tq_model_t*)model);
 }
diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py
index 88bdb6e..e9559ef 100644
--- a/bindings/python/quantcpp/__init__.py
+++ b/bindings/python/quantcpp/__init__.py
@@ -35,6 +35,16 @@
 )
 
 
+class ChatContextOverflow(RuntimeError):
+    """Raised when chat history exceeds the model's context window.
+
+    ``chat()`` raises this when ``quant_chat`` returns -2. The C side has
+    already auto-reset the session by then — the caller must trim its
+    conversation history (drop the oldest turns) and retry. Catching this
+    is the supported way to detect "we hit max_seq_len" without log parsing.
+    """
+
+
 # -----------------------------------------------------------------------
 # Model registry — small GGUF models auto-downloaded from HuggingFace
 # -----------------------------------------------------------------------
@@ -394,6 +404,15 @@ def chat(self, prompt: str) -> Iterator[str]:
 
         Falls back to ``generate()`` on older library builds without
         ``quant_chat`` symbol.
+
+        Raises
+        ------
+        ChatContextOverflow
+            When the conversation history exceeds the model's context
+            window. The session has been auto-reset; the caller should
+            trim history and retry.
+        RuntimeError
+            On other generation failures (allocation, invalid state).
         """
         self._ensure_open()
         lib = get_lib()
@@ -414,6 +433,7 @@ def chat(self, prompt: str) -> Iterator[str]:
         tokens = []
         done = threading.Event()
         error_box = [None]
+        rc_box = [0]
 
         def _on_token(text_ptr, _user_data):
             if text_ptr:
@@ -424,7 +444,8 @@ def _on_token(text_ptr, _user_data):
         def _run():
             try:
                 with self._lock:
-                    lib.quant_chat(self._ctx, prompt.encode("utf-8"), cb, None)
+                    rc_box[0] = lib.quant_chat(
+                        self._ctx, prompt.encode("utf-8"), cb, None)
             except Exception as e:
                 error_box[0] = e
             finally:
@@ -448,6 +469,19 @@ def _run():
         if error_box[0] is not None:
             raise error_box[0]
 
+        # Surface generation failures from the C side. Previously these
+        # were silently swallowed: -2 (context overflow) and -1 (alloc
+        # failure) both produced empty token streams that callers could
+        # not distinguish from "the model decided to say nothing".
+        rc = rc_box[0]
+        if rc == -2:
+            raise ChatContextOverflow(
+                "conversation history exceeds the model's context window — "
+                "session has been reset, retry with shorter history"
+            )
+        if rc < 0:
+            raise RuntimeError(f"quant_chat failed with rc={rc}")
+
     def reset_chat(self) -> None:
         """Reset the chat KV cache. Next chat() call starts fresh."""
         self._ensure_open()
@@ -528,4 +562,4 @@ def load(path: str, **kwargs) -> Model:
     return Model(path, **kwargs)
 
 
-__all__ = ["Model", "load", "download", "__version__"]
+__all__ = ["Model", "load", "download", "ChatContextOverflow", "__version__"]
diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py
index 954d7fc..830204f 100644
--- a/bindings/python/quantcpp/cli.py
+++ b/bindings/python/quantcpp/cli.py
@@ -151,24 +151,63 @@ def cmd_run(args):
             print(tok, end="", flush=True)
         print()
     else:
+        from quantcpp import ChatContextOverflow
         print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
         # Multi-turn chat: accumulate history as ChatML so the model sees
         # prior turns. m.chat() reuses the KV cache via prefix-match, so
         # repeating the history is cheap (O(new tokens), not O(n^2)).
-        history = ""
+        # turns is a list of (user_msg, assistant_msg) pairs so we can
+        # trim from the front when we hit context overflow.
+        turns = []
+        def _build_history(extra_user=None):
+            parts = []
+            for u, a in turns:
+                parts.append(f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n")
+            if extra_user is not None:
+                parts.append(f"<|im_start|>user\n{extra_user}<|im_end|>\n<|im_start|>assistant\n")
+            return "".join(parts)
+
         try:
             while True:
                 question = input("\nYou: ")
                 if not question.strip():
                     continue
-                history += f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
                 print("AI: ", end="", flush=True)
                 reply_buf = []
-                for tok in m.chat(history):
-                    print(tok, end="", flush=True)
-                    reply_buf.append(tok)
+                # Retry loop: on context overflow, drop the oldest turn
+                # and try again. Without this, the C side resets the KV
+                # cache but Python's history still has the bloat, so
+                # every subsequent turn would loop back into overflow.
+                attempt = 0
+                while True:
+                    history = _build_history(extra_user=question)
+                    try:
+                        for tok in m.chat(history):
+                            print(tok, end="", flush=True)
+                            reply_buf.append(tok)
+                        break
+                    except ChatContextOverflow:
+                        if not turns:
+                            print("\n[chat] message alone exceeds context window — try a shorter question.",
+                                  file=sys.stderr)
+                            reply_buf = []  # nothing was emitted
+                            break
+                        dropped = turns.pop(0)
+                        attempt += 1
+                        print(f"\n[chat] context full \u2014 dropped oldest turn ({len(dropped[0])+len(dropped[1])} chars), retrying...",
+                              file=sys.stderr)
+                        # The session was already reset by the C side;
+                        # retrying with the trimmed history will hit
+                        # the slow path on this turn and the fast path
+                        # again from the next turn onward.
+                        if attempt > 8:
+                            print("[chat] too many overflow retries, giving up on this turn.",
+                                  file=sys.stderr)
+                            reply_buf = []
+                            break
                 print()
-                history += "".join(reply_buf) + "<|im_end|>\n"
+                if reply_buf:
+                    turns.append((question, "".join(reply_buf)))
         except (KeyboardInterrupt, EOFError):
             print("\nBye!", file=sys.stderr)
 
diff --git a/quant.h b/quant.h
index 9a2691c..36cbbb2 100644
--- a/quant.h
+++ b/quant.h
@@ -15695,17 +15695,23 @@ int tq_generate_continue(tq_model_t* model,
         n_new = 1;
     }
 
-    /* Sliding window: drop oldest prompt tokens if the new prompt would
-     * leave no room for max_tokens of generation. Keeps the most recent
-     * tokens. Forces full reprefill since the prefix shifted. */
+    /* Overflow check: reject prompts that won't fit. The previous
+     * behavior was to silently drop oldest tokens via a sliding window,
+     * but that desynced any cached_text the higher-level wrapper held
+     * (cached_text claimed the full prompt, while cached_tokens only
+     * had the truncated tail — next turn's text-prefix match would
+     * map text bytes to the wrong KV positions). Returning -2 lets the
+     * caller decide (reset chat, show error). */
     int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
     int budget  = max_prompt - reserve - 32;
     if (budget < 64) budget = 64;
     if (n_new > budget) {
-        int drop = n_new - budget;
-        memmove(new_tokens, new_tokens + drop, (size_t)budget * sizeof(int));
-        n_new = budget;
-        *n_cached_io = 0;
+        free(new_tokens);
+        if (getenv("TQ_CHAT_DEBUG")) {
+            fprintf(stderr, "[chat] OVERFLOW n_new=%d budget=%d max=%d\n",
+                    n_new, budget, max_prompt);
+        }
+        return -2;
     }
 
     /* Find longest common prefix with the cached tokens.
@@ -15872,6 +15878,7 @@ typedef struct {
     char*  buf;
     size_t len;
     size_t cap;
+    int    tainted;   /* 1 if accumulation ever failed → buf is incomplete */
     void (*user_cb)(const char*, void*);
     void*  user_data;
 } chat_accum_t;
@@ -15879,18 +15886,22 @@ typedef struct {
 static void chat_accum_callback(const char* tok, void* u) {
     chat_accum_t* ctx = (chat_accum_t*)u;
     if (!tok) return;
+    /* Always pass through to the user's callback first — losing tokens
+     * from the user's stream because of an INTERNAL realloc failure is
+     * far worse than a stale cached_text on the next turn. */
+    if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
+    if (ctx->tainted) return;
     size_t tlen = strlen(tok);
     if (ctx->len + tlen + 1 > ctx->cap) {
         size_t new_cap = (ctx->cap + tlen + 64) * 2;
         char* nb = (char*)realloc(ctx->buf, new_cap);
-        if (!nb) return;
+        if (!nb) { ctx->tainted = 1; return; }
         ctx->buf = nb;
         ctx->cap = new_cap;
     }
     memcpy(ctx->buf + ctx->len, tok, tlen);
     ctx->len += tlen;
     ctx->buf[ctx->len] = '\0';
-    if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
 }
 
 int tq_generate_chat_text(tq_model_t* model,
@@ -15918,7 +15929,7 @@ int tq_generate_chat_text(tq_model_t* model,
         }
     }
 
-    chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0,
+    chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
                             .user_cb = config->on_token,
                             .user_data = config->user_data };
     void (*orig_cb)(const char*, void*) = config->on_token;
@@ -15982,12 +15993,35 @@ int tq_generate_chat_text(tq_model_t* model,
                     matched_text_len, n_suffix);
         }
 
-        /* Generation loop */
+        /* Generation loop — mirrors tq_generate_continue including
+         * rep_penalty (which the fast path was silently dropping). */
         int vocab_size = model->config.vocab_size;
         int n_cached = *n_cached_io;
         int pos = n_cached;
         int prev_token = n_cached > 0 ? cached[n_cached - 1] : 1;
 
+        float rep_penalty = config->rep_penalty;
+        int rep_window = config->rep_window;
+        if (rep_window > 64) rep_window = 64;
+        int recent_tokens[64];
+        int recent_count = 0;
+        for (int i = (n_cached > rep_window ? n_cached - rep_window : 0); i < n_cached; i++) {
+            recent_tokens[recent_count % 64] = cached[i];
+            recent_count++;
+        }
+        if (rep_penalty > 1.0f) {
+            int window = recent_count < rep_window ? recent_count : rep_window;
+            for (int r = 0; r < window; r++) {
+                int idx = (recent_count - 1 - r) % 64;
+                if (idx < 0) idx += 64;
+                int tok = recent_tokens[idx];
+                if (tok >= 0 && tok < vocab_size && state->logits) {
+                    if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                    else                         state->logits[tok] *= rep_penalty;
+                }
+            }
+        }
+
         uint64_t rng_state = config->rng_seed
             ? (uint64_t)config->rng_seed : (uint64_t)time(NULL);
         int next_token = tq_sample_topp(state->logits, vocab_size,
@@ -16033,9 +16067,24 @@ int tq_generate_chat_text(tq_model_t* model,
             pos++;
             generated++;
 
+            if (rep_penalty > 1.0f) {
+                int window = recent_count < rep_window ? recent_count : rep_window;
+                for (int r = 0; r < window; r++) {
+                    int idx = (recent_count - 1 - r) % 64;
+                    if (idx < 0) idx += 64;
+                    int tok = recent_tokens[idx];
+                    if (tok >= 0 && tok < vocab_size) {
+                        if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                        else                         state->logits[tok] *= rep_penalty;
+                    }
+                }
+            }
+
             next_token = tq_sample_topp(state->logits, vocab_size,
                                          config->temperature, config->top_p,
                                          &rng_state);
+            recent_tokens[recent_count % 64] = next_token;
+            recent_count++;
         }
 
         if (output && output_size > 0) {
@@ -16051,21 +16100,35 @@ int tq_generate_chat_text(tq_model_t* model,
             output, output_size);
     }
 
-update_cache:
     config->on_token = orig_cb;
     config->user_data = orig_ud;
 
+    /* Update cached_text only if we know the KV state corresponds
+     * EXACTLY to (prompt + accum.buf):
+     *   - generated >= 0: generation didn't error out
+     *   - !accum.tainted: every generated token was captured
+     * On any failure, clear cached_text so the next call falls through
+     * to the slow path with a clean slate instead of trusting bytes
+     * that don't match the KV cache. */
     if (cached_text_io) {
-        size_t plen = strlen(prompt);
-        size_t glen = accum.len;
-        size_t new_len = plen + glen;
-        char* nt = (char*)malloc(new_len + 1);
-        if (nt) {
-            memcpy(nt, prompt, plen);
-            if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
-            nt[new_len] = '\0';
-            if (*cached_text_io) free(*cached_text_io);
-            *cached_text_io = nt;
+        if (generated < 0 || accum.tainted) {
+            if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+        } else {
+            size_t plen = strlen(prompt);
+            size_t glen = accum.len;
+            size_t new_len = plen + glen;
+            char* nt = (char*)malloc(new_len + 1);
+            if (nt) {
+                memcpy(nt, prompt, plen);
+                if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
+                nt[new_len] = '\0';
+                if (*cached_text_io) free(*cached_text_io);
+                *cached_text_io = nt;
+            } else {
+                /* malloc failed → can't refresh cached_text. Clearing it
+                 * is safer than leaving the previous (now stale) value. */
+                if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+            }
         }
     }
     if (accum.buf) free(accum.buf);
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
index 1f45a35..0211a83 100644
--- a/src/engine/tq_generate.c
+++ b/src/engine/tq_generate.c
@@ -653,20 +653,23 @@ int tq_generate_continue(tq_model_t* model,
         n_new = 1;
     }
 
-    /* Sliding window: if the new prompt + reserved generation room would
-     * exceed max_seq_len, drop the oldest tokens from the front of the
-     * prompt. We keep the most recent (max_seq_len - max_tokens - 32) tokens.
-     * Note: this discards conversation history; ideally callers send
-     * pre-trimmed prompts, but this prevents catastrophic failure. */
+    /* Overflow check: reject prompts that won't fit. The previous
+     * behavior was to silently drop oldest tokens via a sliding window,
+     * but that desynced any cached_text the higher-level wrapper held
+     * (cached_text claimed the full prompt, while cached_tokens only
+     * had the truncated tail — next turn's text-prefix match would
+     * map text bytes to the wrong KV positions). Returning -2 lets the
+     * caller decide (reset chat, show error). */
     int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
     int budget  = max_prompt - reserve - 32;
     if (budget < 64) budget = 64;
     if (n_new > budget) {
-        int drop = n_new - budget;
-        memmove(new_tokens, new_tokens + drop, (size_t)budget * sizeof(int));
-        n_new = budget;
-        /* Force full reprefill since the prefix shifted */
-        *n_cached_io = 0;
+        free(new_tokens);
+        if (getenv("TQ_CHAT_DEBUG")) {
+            fprintf(stderr, "[chat] OVERFLOW n_new=%d budget=%d max=%d\n",
+                    n_new, budget, max_prompt);
+        }
+        return -2;
     }
 
     int n_cached = *n_cached_io;
@@ -835,6 +838,7 @@ typedef struct {
     char*  buf;
     size_t len;
     size_t cap;
+    int    tainted;   /* 1 if accumulation ever failed → buf is incomplete */
     void (*user_cb)(const char*, void*);
     void*  user_data;
 } chat_accum_t;
@@ -842,18 +846,22 @@ typedef struct {
 static void chat_accum_callback(const char* tok, void* u) {
     chat_accum_t* ctx = (chat_accum_t*)u;
     if (!tok) return;
+    /* Always pass through to the user's callback first — losing tokens
+     * from the user's stream because of an INTERNAL realloc failure is
+     * far worse than a stale cached_text on the next turn. */
+    if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
+    if (ctx->tainted) return;
     size_t tlen = strlen(tok);
     if (ctx->len + tlen + 1 > ctx->cap) {
         size_t new_cap = (ctx->cap + tlen + 64) * 2;
         char* nb = (char*)realloc(ctx->buf, new_cap);
-        if (!nb) return;
+        if (!nb) { ctx->tainted = 1; return; }
         ctx->buf = nb;
         ctx->cap = new_cap;
     }
     memcpy(ctx->buf + ctx->len, tok, tlen);
     ctx->len += tlen;
     ctx->buf[ctx->len] = '\0';
-    if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
 }
 
 int tq_generate_chat_text(tq_model_t* model,
@@ -897,7 +905,7 @@ int tq_generate_chat_text(tq_model_t* model,
 
     /* Wrap user callback to capture generated text into a buffer for the
      * next call's cached_text update. */
-    chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0,
+    chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
                             .user_cb = config->on_token,
                             .user_data = config->user_data };
     void (*orig_cb)(const char*, void*) = config->on_token;
@@ -971,12 +979,36 @@ int tq_generate_chat_text(tq_model_t* model,
                     matched_text_len, n_suffix);
         }
 
-        /* --- Run generation loop directly --- */
+        /* --- Run generation loop directly. Mirrors tq_generate_continue
+         *     including rep_penalty (the fast path was silently dropping
+         *     it before, leaving rep_penalty inconsistent across turns). */
         int vocab_size = model->config.vocab_size;
         int n_cached = *n_cached_io;
         int pos = n_cached;
         int prev_token = n_cached > 0 ? cached[n_cached - 1] : 1;
 
+        float rep_penalty = config->rep_penalty;
+        int rep_window = config->rep_window;
+        if (rep_window > 64) rep_window = 64;
+        int recent_tokens[64];
+        int recent_count = 0;
+        for (int i = (n_cached > rep_window ? n_cached - rep_window : 0); i < n_cached; i++) {
+            recent_tokens[recent_count % 64] = cached[i];
+            recent_count++;
+        }
+        if (rep_penalty > 1.0f) {
+            int window = recent_count < rep_window ? recent_count : rep_window;
+            for (int r = 0; r < window; r++) {
+                int idx = (recent_count - 1 - r) % 64;
+                if (idx < 0) idx += 64;
+                int tok = recent_tokens[idx];
+                if (tok >= 0 && tok < vocab_size && state->logits) {
+                    if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                    else                         state->logits[tok] *= rep_penalty;
+                }
+            }
+        }
+
         unsigned long long rng_state = config->rng_seed
             ? (unsigned long long)config->rng_seed : (unsigned long long)time(NULL);
         int next_token = tq_sample_topp(state->logits, vocab_size,
@@ -1022,9 +1054,24 @@ int tq_generate_chat_text(tq_model_t* model,
             pos++;
             generated++;
 
+            if (rep_penalty > 1.0f) {
+                int window = recent_count < rep_window ? recent_count : rep_window;
+                for (int r = 0; r < window; r++) {
+                    int idx = (recent_count - 1 - r) % 64;
+                    if (idx < 0) idx += 64;
+                    int tok = recent_tokens[idx];
+                    if (tok >= 0 && tok < vocab_size) {
+                        if (state->logits[tok] > 0) state->logits[tok] /= rep_penalty;
+                        else                         state->logits[tok] *= rep_penalty;
+                    }
+                }
+            }
+
             next_token = tq_sample_topp(state->logits, vocab_size,
                                          config->temperature, config->top_p,
                                          &rng_state);
+            recent_tokens[recent_count % 64] = next_token;
+            recent_count++;
         }
 
         if (output && output_size > 0) {
@@ -1041,24 +1088,36 @@ int tq_generate_chat_text(tq_model_t* model,
             output, output_size);
     }
 
-update_cache:
     /* Restore the original callback before returning to caller */
     config->on_token = orig_cb;
     config->user_data = orig_ud;
 
-    /* Update cached_text = prompt + generated text. The next call can
-     * fast-path against this if its prompt starts with this string. */
+    /* Update cached_text only if we know the KV state corresponds
+     * EXACTLY to (prompt + accum.buf):
+     *   - generated >= 0: generation didn't error out
+     *   - !accum.tainted: every generated token was captured
+     * On any failure, clear cached_text so the next call falls through
+     * to the slow path with a clean slate instead of trusting bytes
+     * that don't match the KV cache. */
     if (cached_text_io) {
-        size_t plen = strlen(prompt);
-        size_t glen = accum.len;
-        size_t new_len = plen + glen;
-        char* nt = (char*)malloc(new_len + 1);
-        if (nt) {
-            memcpy(nt, prompt, plen);
-            if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
-            nt[new_len] = '\0';
-            if (*cached_text_io) free(*cached_text_io);
-            *cached_text_io = nt;
+        if (generated < 0 || accum.tainted) {
+            if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+        } else {
+            size_t plen = strlen(prompt);
+            size_t glen = accum.len;
+            size_t new_len = plen + glen;
+            char* nt = (char*)malloc(new_len + 1);
+            if (nt) {
+                memcpy(nt, prompt, plen);
+                if (glen > 0 && accum.buf) memcpy(nt + plen, accum.buf, glen);
+                nt[new_len] = '\0';
+                if (*cached_text_io) free(*cached_text_io);
+                *cached_text_io = nt;
+            } else {
+                /* malloc failed → can't refresh cached_text. Clearing it
+                 * is safer than leaving the previous (now stale) value. */
+                if (*cached_text_io) { free(*cached_text_io); *cached_text_io = NULL; }
+            }
         }
     }
     if (accum.buf) free(accum.buf);
diff --git a/src/server/tq_server.c b/src/server/tq_server.c
index 81db519..711557b 100644
--- a/src/server/tq_server.c
+++ b/src/server/tq_server.c
@@ -109,6 +109,12 @@ typedef struct {
     int         cached_capacity;
     char*       cached_text;          /* prompt + generated, for text-prefix matching */
     long        last_used;            /* monotonic counter for LRU */
+    /* Track the kv_type / value_quant_bits used to allocate kv_state.
+     * If a later request reuses this session id with different params,
+     * we must rebuild the state — the cached KV blocks are formatted
+     * for the original config and would be misinterpreted otherwise. */
+    tq_type     kv_type;
+    int         value_quant_bits;
 } kv_session_t;
 
 struct tq_server {
@@ -123,7 +129,8 @@ struct tq_server {
 };
 
 /* Find or allocate a session by id. Caller holds inference_mutex.
- * Returns a pointer into server->sessions. Never NULL (LRU evicts). */
+ * Returns a pointer into server->sessions, or NULL on allocation failure
+ * (caller must check: HTTP 500, or an SSE error event when streaming). */
 static kv_session_t* get_or_create_session(tq_server_t* server,
                                             const char* sid,
                                             tq_type kv_type,
@@ -141,8 +148,36 @@ static kv_session_t* get_or_create_session(tq_server_t* server,
             continue;
         }
         if (strncmp(server->sessions[i].id, sid, SESSION_ID_MAX) == 0) {
-            server->sessions[i].last_used = server->session_clock;
-            return &server->sessions[i];
+            kv_session_t* hit = &server->sessions[i];
+            hit->last_used = server->session_clock;
+            /* If the client switched kv_type / value_quant_bits between
+             * turns, the cached KV blocks are formatted for the OLD
+             * config. We must rebuild — reusing the state would
+             * misinterpret quantized blocks and produce garbage. */
+            if (hit->kv_type != kv_type ||
+                hit->value_quant_bits != value_quant_bits) {
+                fprintf(stderr, "[server] session %s: kv_type/vq_bits changed, rebuilding state\n", hit->id);
+                if (hit->kv_state) tq_free_state(hit->kv_state);
+                if (hit->cached_tokens) free(hit->cached_tokens);
+                if (hit->cached_text) free(hit->cached_text);
+                hit->kv_state = tq_create_state_ex(
+                    &server->config.model->config, kv_type, value_quant_bits);
+                if (!hit->kv_state) {
+                    /* State rebuild failed → mark slot empty so we don't
+                     * leave a half-baked entry that future calls would
+                     * NULL-deref. */
+                    fprintf(stderr, "[server] tq_create_state_ex failed (rebuild) for session %s\n", hit->id);
+                    memset(hit, 0, sizeof(*hit));
+                    return NULL;
+                }
+                hit->cached_tokens = NULL;
+                hit->n_cached = 0;
+                hit->cached_capacity = 0;
+                hit->cached_text = NULL;
+                hit->kv_type = kv_type;
+                hit->value_quant_bits = value_quant_bits;
+            }
+            return hit;
         }
         if (server->sessions[i].last_used < lru_time) {
             lru_time = server->sessions[i].last_used;
@@ -163,6 +198,17 @@ static kv_session_t* get_or_create_session(tq_server_t* server,
     strncpy(s->id, sid, SESSION_ID_MAX - 1);
     s->kv_state = tq_create_state_ex(
         &server->config.model->config, kv_type, value_quant_bits);
+    if (!s->kv_state) {
+        /* tq_create_state_ex returned NULL (OOM, bad config). Clear the
+         * slot id so the slot looks empty again, otherwise the next
+         * call with the same sid would find this entry and dereference
+         * a NULL kv_state. */
+        fprintf(stderr, "[server] tq_create_state_ex failed for session %s\n", sid);
+        memset(s, 0, sizeof(*s));
+        return NULL;
+    }
+    s->kv_type = kv_type;
+    s->value_quant_bits = value_quant_bits;
     s->last_used = server->session_clock;
     return s;
 }
@@ -779,13 +825,22 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
         kv_session_t* sess = get_or_create_session(server, req.session_id,
                                                     gen_cfg.kv_type,
                                                     gen_cfg.value_quant_bits);
-        int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
-                               sess->kv_state, req.prompt, &gen_cfg,
-                               &sess->cached_text,
-                               &sess->cached_tokens, &sess->n_cached,
-                               &sess->cached_capacity,
-                               output, sizeof(output));
-        if (gen_rc == -2) {
+        int gen_rc;
+        if (!sess) {
+            /* tq_create_state_ex failed inside get_or_create_session.
+             * Synthesize an error event in the SSE stream so the client
+             * doesn't see a happy "stop" with empty content. */
+            gen_rc = -1;
+            LOG_ERROR("Session allocation failed");
+        } else {
+            gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
+                                   sess->kv_state, req.prompt, &gen_cfg,
+                                   &sess->cached_text,
+                                   &sess->cached_tokens, &sess->n_cached,
+                                   &sess->cached_capacity,
+                                   output, sizeof(output));
+        }
+        if (gen_rc == -2 && sess) {
             /* Context overflow — auto-reset session and surface error.
              * Client should retry with a shorter conversation history. */
             LOG_ERROR("Session %s: context overflow, auto-reset", sess->id);
@@ -797,7 +852,37 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
             if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; }
         }
 
-        /* Send final chunk with finish_reason */
+        /* Send final chunk. finish_reason: "stop" on success, "error"
+         * on -1, "length" on -2 (overflow). The previous code always
+         * sent "stop" even when generation errored, leaving clients
+         * thinking the model decided to produce zero tokens. */
+        const char* finish_reason = "stop";
+        if (gen_rc == -2) finish_reason = "length";
+        else if (gen_rc < 0) finish_reason = "error";
+
+        if (gen_rc < 0) {
+            /* Emit an error delta so OpenAI-compatible clients can see
+             * what went wrong. Most clients surface the delta content. */
+            char err_chunk[SSE_CHUNK_SIZE];
+            const char* msg = (gen_rc == -2)
+                ? "context overflow — session reset, retry with shorter history"
+                : "internal error during generation";
+            snprintf(err_chunk, sizeof(err_chunk),
+                "{"
+                    "\"id\":\"%s\","
+                    "\"object\":\"chat.completion.chunk\","
+                    "\"created\":%ld,"
+                    "\"model\":\"%s\","
+                    "\"choices\":[{"
+                        "\"index\":0,"
+                        "\"delta\":{\"content\":\"[%s]\"},"
+                        "\"finish_reason\":null"
+                    "}]"
+                "}",
+                completion_id, (long)time(NULL), model_id, msg);
+            send_sse_event(fd, err_chunk);
+        }
+
         char final_chunk[SSE_CHUNK_SIZE];
         snprintf(final_chunk, sizeof(final_chunk),
             "{"
@@ -808,14 +893,15 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
                 "\"choices\":[{"
                     "\"index\":0,"
                     "\"delta\":{},"
-                    "\"finish_reason\":\"stop\""
+                    "\"finish_reason\":\"%s\""
                 "}]"
             "}",
-            completion_id, (long)time(NULL), model_id);
+            completion_id, (long)time(NULL), model_id, finish_reason);
         send_sse_event(fd, final_chunk);
         send_sse_event(fd, "[DONE]");
 
-        LOG_INFO("Streaming complete: %d tokens", sse_ctx.token_count);
+        LOG_INFO("Streaming complete: %d tokens (rc=%d)",
+                 sse_ctx.token_count, gen_rc);
 
     } else {
         /* --- Non-streaming --- */
@@ -828,6 +914,16 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
         kv_session_t* sess = get_or_create_session(server, req.session_id,
                                                     gen_cfg.kv_type,
                                                     gen_cfg.value_quant_bits);
+        if (!sess) {
+            LOG_ERROR("Session allocation failed");
+            free(collect.buf);
+            pthread_mutex_unlock(&server->inference_mutex);
+            free_chat_request(&req);
+            send_json(fd, 500, "Internal Server Error",
+                "{\"error\":{\"message\":\"Failed to allocate KV state for session\","
+                "\"type\":\"server_error\",\"code\":\"session_alloc_failed\"}}");
+            return;
+        }
         int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
                                sess->kv_state, req.prompt, &gen_cfg,
                                &sess->cached_text,
@@ -852,6 +948,20 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
                 "\"type\":\"context_overflow\",\"code\":\"context_full\"}}");
             return;
         }
+        if (gen_rc < 0) {
+            /* Other error (-1: invalid args, OOM during prefill, etc.).
+             * The previous code fell through and sent HTTP 200 with an
+             * empty content string, which is indistinguishable from a
+             * deliberate empty completion. Return 500 instead. */
+            LOG_ERROR("Session %s: generation failed (rc=%d)", sess->id, gen_rc);
+            free(collect.buf);
+            pthread_mutex_unlock(&server->inference_mutex);
+            free_chat_request(&req);
+            send_json(fd, 500, "Internal Server Error",
+                "{\"error\":{\"message\":\"Generation failed (allocation error or invalid state)\","
+                "\"type\":\"server_error\",\"code\":\"generation_failed\"}}");
+            return;
+        }
 
         const char* content = collect.buf ? collect.buf : "";
 
diff --git a/wasm/quant.wasm b/wasm/quant.wasm
index 061f952e4bc7606702bfe3fd7ab93c1d319541ef..f018484ec65b5e37499d72dfa881b5ad97eade97 100755
GIT binary patch
delta 16558
zcmb_@3wTw<wf~-(eO@`|Bq#63AuxM`5|Ho`11ey)Z6L$|QmLRt#MTN8s6^xA*2hVa
zf0YWB+YN4RYZa{Zi3&t}e9($k#VT#lqN1XPfEWca3W&Bu?r+WB=Oj5Kz2DdG58uhm
zp0#Gpnl)>!S+i#EKXv4`9nIa4#W{=G5&k4!dq>NkxsUPBSJ<zLNo=p3s~53$yFx$N
zx8n<D?Xb_&?_#^`4t;w6rH7dmlk{rIEmRvMtKynuQD*G~;!b;ApuleQ-CTCy3swPu
z511C}QUJ|r1xl-Zk-ucTu-X;E<fz};g;tHgXi}A&um?2*E&%{s0))B?1g3awJ>uV~
z7@}eNWN6tL=q71|oUqia<7_IBHS25EXdU~4H5i;HEX)$@nz{u<OI+JzYK=AEe20B$
zu*sx(8ahw#YG7iTgy{z+Ycjb)+iCwZ_$S|CFn74+k0FDz4*SFKuh<Uzs>t|K8pDK$
zD)`bbnaL}7&{R^{VQ-DpLa4FmFc*iXaI|hX-;bUbR9rHv-QEx_1JT!_<A;(&0wgn<
znh%voIGkcFToYw7yujj381k?^DtkZMX_x0rp@#)Ik5Ew<I~OBg7P~!){xlsQWP$;{
zja@Q<DAJ<R8c2=`Bob^ev`8o{{j`LH&!+$%b;+fFp{dZZ%GLq<uG}*d>yHo)5S~z#
z2n$<J9bsoD#)9YuRGQZ|<DX8CK$Tfmq`_L;XzhIX&4+zg)5u$ncp9-XRcV%C6JpdY
z2y&qrXf&DhPmKnx<!S`ehryP*)u6P`&nqy|vr^M6oz@5Irv8&6s>uR-)e(Dn-WhCz
zy)~~g>QNz<mmaZM{yg@C{lolI{WSc_Bld6dCyWDiZv?4hIf={L)PPbWQJ}6gEX<X<
z(<s6nB~~yvvDY0}1NWNbQ{$vsXj9S+TbiL+YtUaOqN`9Lp<AR)395XJiYrliTI)$f
zRFk4MK@Lo5u?Z!Beo3Vbx1pk0q;{5^9M$@V)l%tJ#X_`PFZn`?B)E|JXjroy1%|H!
zqinYSSui}VtQA;7VQCHS>bj=;ox;H$=9PtI1Kepzeopu4KqK@9!$$&su&|P&c)M`K
z*`z8bj;}PB#>1T5aZn3b?5ZlRlcXA~3e+r@x7wo0s=JP`AzT~FW0*B0rRqTot#}SL
z&}>c=<k}<lBSk+llo4rNrp{o%T$#FS!#nKP%ggL3#pg+7?u=JsDLrHN0at4>l`_G{
zZ7zPw@a#Y%HB^ob{AOChDY_oEw%ae3%t*OIy%%>lqBLFxQ-jJ#1<r7GBT9?y8%izD
z;XPmata5mSi8(6|tZbC0J@Bu~9_w|ofnFvXw(p3a>rPnfK4B%CCe!D6w{PRGm0>95
zj3G2NDw%-PXjFSozrXf6qR1HWn$j-O(lV1wQx5dv{%@!uI8DrV!Vu>SczK|^g5K6(
zFf1jLK{Z4`x=XXe9wSf7A^|GR-qmuoToHqfvr)6yQ!N*a8o~q1-qU0-*3UVsKPzro
zd+Gy%KlD+{l(A3zh(G*h%kf`d#|k@MVx2QJ{PhY>PxkG1)%5}S^seeYAZPuyuReGG
zwy!?cg9Ucr?y5d0t_Ae6yNC4wec<lC#`E7ZI*u7N4AZnttH#m7Nhu@BL0&sd>u5Re
zp7R;|*nW7e$@bZQUE801WFJ^tjyRzDzCwGzx**$QSF9^%ZT5_N3qwKGTUfj8EAd2z
zDy6`_zGU5K(0Fnk(b&0;XdFc!MPu+t4{Of>tpUTftTK(|(!fFo8{^~nFb_)f%Nfxv
zXRY>%dnbe5oA(mEukR&##lK6@JNMtxt54HwM33F}FY&k4e)M<5U+eG8iTg6S*rE{|
z*L&SQ$^=V-HhEB*P&ouL7k^!WEbt&oAok<;1u@uD_to;1OYEOLRA%@4{U}shrvE;V
z^;`r$1Q$pzf_>xtwS6q?JNNgsu;U)+OOaMQ&{v-WefF^)?5oe(2m2c5NS}RXK6HvJ
zf^(>t{`;bqy$^B5zO+Anc<{h4{>V~F5P$}5W@f-rVlI#ZV)0kP=0-hI#tzx_k4VH*
zmp`%qJ>Pm{=)^vIj!F0IM%#+f+Ko(s4%rje_fzD)XFs~W2qa%zKeTUVW~Tae*tw4q
zGs7ND_5RJH)O*vTJ$mczRCISLO<lc@tKMagQSY&j{XC@_?>t7Tp=4t|&e<tP*Qcd2
z?(EZUd%!xbHJW~xXvorPF_jz1+A6iL?CBc}cEoPnP{t11%Qp;UN9_$8it|a52q_42
zjw{En>~|IBF@;&YabV9;279AK+)<)#-|?<$er5l7qtWk37gtetW@D7bLw66`_hu55
z9>F}9V8kWZ4+MS0c1KTwk<_YdF2Z5Ea#J~k#9I2<^EXZOYA<-T3t30YoR(IB<_XCc
zSjVQP#%Hm;EfZeY&ZFCo(jH2&Z<e*iE_-RQy?9#%Z(d>ldfRy|zj&#b`*t0r)5HB+
zXR>4VzqVc)IQB9#qSoG)%U)?<pz``({=m9XiGj+h*LJmhwryDu(CgmZ&*G^uG#R$G
zsPPb9sQ~Vnebd`jy!oj8<lBR&c<=2|#mz?%`M!uj@dIlJ$S6X2#U8kG{Mc8}jGx=A
z;z>koE=iM3iz3YR!1Ym0RHsn4wA{J#e!-WnuqW=igo<^$E@s<X*!z2VaPMbC%?#T&
zzgcYStw)S5W+UC0J@yr|vWO2bS+;v{(QQW&a^b*XVPCZPIdK9lKWfj~U0$@(1;rFd
zcR{O;+P?s3jSDJKAioQ`>!|%lfY!PosX##&bl*|C1E2?8P=x|zxuEq&?SXAZ;RY8p
zQh_2a=n4CRw*G}rxsY)RlI=n^+soT(`;_ngw*OqdbN9^jL82|~rxxTs^%E0ohoy)c
zh+ZrB6D>!N-NfPCuQ^_oy&VDbND@{8$JN%Br;kTC+wQ!~*!k=Ur<}7NMcvbe3{cG1
z>a5^wjHymc)JX-|sL<KS3;q<|E@uyCOH~LHB)nZtqhNJz@aJq1%#HpCH5G?Zq$7Q)
z=r8#{I$M@Ct#v(HB;i*z8N&WzB1&o;rllGwYU0<Xn;Hm_qfO6f6KL}zJZN<m=di*v
zsJp`T_0r$pOaDN+|8wY{NcRu*(m$)0{^3;r)W8v%jp#NT#ZI&ra@i-6OK3Tn^kPI$
z&!m^z3%$G(>1p|y<O+zKFOyuMNA4_d1r*Okz0fPpq*p@p{AS2qDJM4<2h+3=%Vg+;
z1&gCiEf@(*jZ+RkmATaY(U!T?1JI^=CYR>2PCo4Laq=Zb(CSbX$!ei+!B{E0l4y#R
zy8t&!a5(vB0lQwX!^x9N*j``$w!h$9waD<xT<MoFGMnwm^8TzpW22q%1K780bn;Be
z21VFjXWlS&jX2fvI}?Vp5~pJr+kwWHD%mycROig$>>!?oj-aRH+7V2Li*|lLlC8$$
znWI=|2#=?q$>!kk&NJC>2|8v1n~%r4Ca^que0-vUM$TfV(c{_G?8g|bz{x&~l_X!N
zW^WH+8<M|nVE^=?T)mi$&B@!osPlZjeGyZCIsJ7$NU(*ywf~^G_x7EAw159+z2m>p
zNBi_8z2k3N!Ww(wCprBV=4ZVCocT*3mF>yDFJ)szZ}5`adLL+YAMJU|ddL4|AMKG}
z^p3ym7wq!h_zB+5uI#NH6YTFia63ED8xWHm>+D+2Mq;MX6|9uBWAL4<84F&n7TkH^
zPF4lo`{qt|AC5Z7yVtNQi-DBAK`GeM4Qy>T)b6C`*klx!KgZ4h@WJQU<!o=VXbXGJ
zkH@!OW<P;`Rc~c$bT-;~=QVaS9%pZ3Lj=Y-#hLLYD@d;SI~xXQ@}+;UTNuCPeW&hC
z_8zcLev1u2^Q5<69w_JUWE1@;@Be@mGxQ$gSp8X{^X7-_VvKlND|?dXcpJFMqZ}ut
zEzUP>YzeyDxQG2zpgg*dU7%ynnX#W;$~HTH*w5sd_pLyJ5yw00IR1&o$)*<Nl#|+W
zP=y*<lhB}9OVyK!7(i7fsU|1)0Iamb8F_$}*KR(DbWjRQ(UO`94bp@R4Ri-Gm0m+0
zpf94U4=`zVe1@1KcB91xBC#+jD$2rP)zcfI%-WoM`vAL<v36(XK{m+S?ru8BkaTv+
zK4)&$S)*M~cT)qDPL?9I>p8c6&TdI~`)xYqtdevpZ{=oJkPlPIYKpDA6qC32{vTQB
zU{@Ae(L{!n@~$N4ZZw@`9c-M(`pX@xTVlF*o;QEN+{`pFh0HYbGSjTr%(So=IbUR^
znVXq*5?`_=H+93ir*0^Pjl5vMP2D8(53yTAgLkS((+FET+)BjSt}4<;jy(d$9*&h-
zJDlf_u>n~Cx%rcw&Zo!N5U688C(CEu;>l?5@uXAxEgLK;p!B-|WuPmdRBbp5$bfyx
z?tB$m0*OM)fRzp{ou|HKZfpq{#g+jp9a}p42}Z#sU{b--!KKsxIA9cA0!Gngz)DA#
z&Wz*C4KD$s@G@Yf!b|7Y<IIgO0iyUaV5Q<q=b7WI&j53EDgTcH%+<^JAhvz=M!sP1
zQe~@Ee2|5jn`viZlZ@Az6o+yCY~FM|VQqJ+8IQxv=P+Id?{*#IcL%eMZD$6wuYz|Z
z2W0U;765|__#z5HR~7LQczm&lUrpX&T%13T;-7JTIa(L=;|mxbUoYqP@CUa$SM}!)
zvyNo>0DdlKTas58yaC~_^UlfK0e0hHej!LEw+-PRF}Bg!bPAt^B5N2YKYQjd{sgsl
zRPsU;{^7hoij#)(8&Iqp&VP?$@(6CDcy9!M2*v*!$xlP^>_~oj;l{r)$^@MQhs09u
zV0&`HD1JJ)PF{B!-;u|*IrX)?K7u2QO0JV1bC6f5#R1DXSj*|4<rGZer}lLcaOO_o
z6Y1=fIzLfLoWknVK}%}&I4`EasCPSRDFdmEYT>XwFInZ}*YT55b`(n4QC1H-%7Pt*
zlpS4M$9ve(ZFQXNXj2{kxbH(G68ShpmN|c`=ly7A2kPA!=1t{&o+X_Nr~c=<==^mm
zFXaz@;<QiYSHbiDU>d)MH{b5OF^!L=(m$OWXs(#f%~2gocmp<}=4G5#?F<d4Sp(*6
zU^m0LHsTATR5%PU)W2|Un9lo0dp6xaosaF;qlu_$m73S-OXr*E{Osy~hv9YL?kI=6
zM?nnf5gt}-I5(WjPYHpFA&4vI@pE}yUI)Ab^8F1saUm^(0Xv-V3|<jNdxNDW)Zt8=
z0c&V?KAFLr_}V4T#ovP*P}<+)#_aYbm?(z9t|tTDT_CPwmhetZ$1$rj$IR;-Xy$gF
z7uCwFn6<$JbZeB>mJ=JdCnM+av21+n5)QdyCoZ#|2RUnZ*YgYX+}8E{XL{b#^}IKk
zn8`=^*gj|D1^lP{mVHj;g?uLa+*xuVzX)-8>xF!3jJ=c`zL0+hwx7Ie5r3cat%IBy
zS8_9fv%dycLMZ@)QxSpGP{a%$6$OS)MF|3W6#a}K719V%G04cGqQVGMG0cciG17?2
zz)(KMD50luMm81WjT|bfjTjZ*HF621)+j|$!Rw8DYCYE|pw^j289mK53aOZ56j3qH
zD3$?d<8S!zh}Fs$npuNz?lyw<7o!V>X2zT=7V|lgU_@XSh5g~+;=Hw(4+_U5u2TEa
zAL#@dxjZjcgK&tRgGK@F58!J-+YHr^*ww>qN}eI*LXV-?SkqTy`rRj=3`y*Zu`|t3
z+z<eVWhfptbi4K0qJ*eM(g~hrrpzKoj_kCBqE>2Xu3lKh>yaSW>I@&0r^kO>DN7%M
zB0lLV-3_L|d`QN?wdn&RhTm#N-c0)A;EUtrWkzeFS>Qu7n8>E#!crChoQUH5ro|d@
zR-{p&9!Lqq4P4k_ekEy2h$2A=E9sZI41q!e4y&?3hU<*%u*~kkhm69RX*8#L%tB2j
z3kzt{kXo!XY~74Q;D*y<qif*66A{ciT*ayISMeZ<5MAMAsXI-I^<-WRW@x?oH%KJb
z&(hpLFj_{isOSEP*_qI#pcQK{{q!`a&J1F_FbM%%qo|ML<{g4K&kWZn7}$(N5|}6r
zJO>5Xm<cx!O3eo3Fk0LQ3Rl3GdJyD6M1~tob=Yr`(FRz7nT@T%WfoB?*v0WRM`lZZ
zY*s3A%~lKpLOBd<Cc(*24;D_OXF!3|Y?7UljiHC7o?=8rysl&lP#}VhK|5j%hVaHz
zN?>i>j=&Zo;@w1~C#26u0+}6#4!NPT1~b)$!oaXh30Rn!Yi-e{L3wae&~W$;#m(%6
zqz5v)4k~LRh@EEYQJos-R~t8l6_|=9Qj2WhQ-Le?g;w<xGlW)Q-Pj0vv>?2x0hv<G
z@C#D(R;>YxF}{(SWfau}DrTfp<638!oFSTUN$y8(lzeSyk{UM%m4o{T!AiX;kosWu
zQgjoM%(Dc^-deLwvrdZ*wFrnaLnL)(aE(KdabP==pzn((FcoDT*)?e!bH$Cs7>A8u
z#R_0G*}#Tv%>zLg3rwaCi-(O2%5TBDr(kr@>25kYMA1xA(&G!wD5hU;=*mu!_|qT&
zJrTDgAm{Y5-q}!vE>mF;t6KUB6j6kJD4?(sLX`~$^Atf>0=NgZ!k`GGKoCHVYby!|
z(^0k(RXSk`c(OW-?VZP@4!FR&;a3Y^#jCMuQ8)$OWco?bVN2Oc;r)}bG7J!?a!3k#
zq7@^@fpwo|3i6gAEK$mKAV6Al+A@?LVzqo=HsP9z4~jd>B7X*cV<}SsL81@pSd6rx
z(3av$J^5gYWc1Kz7=1kuRc$6bms(+npDdroHg(v36k>#6Q{6SpzspZW0_k=#GV%$K
zG<<@q5#&ed-3&+svpJT99UT~0gxOfqE|VgQDpnTd<M2T6L0dFg3|%vG$*RIKx6Z^v
z5PJnrkQ+w}{XDZm0-qXuaVi?fq|l-aifqj_C_e_&u{5qh=_!NK!Jzavq^${RN7kfI
zHY2V*`6elo#XC8$X-Xh#iLfEkgoJgLYnk3SSS;wXS_I6GteG5F9R;?e@Fhk*xG(*V
zrZt~67aJk8CYOGg2pGilz$JVqa9lnqE1bBx!5f6!rq4=fh?8L-`E+rSgTTPlYlvHZ
z6TA^-1NMDGfmwo4G;J<~Um{CaQzTX@OIC#uD)?jxuBB7VN##1iFb-{qs5ay0>Nm$0
z`(!*kR?H@+7efye1qc;rG-HMj%<a_hl9p#de&xQw5C$rfdhC45?`mbZ%*BC&!L&%-
zaLj}!N!t!|6t5cjoF6pt@*iRjI;8)pHi>G)uWtRUJ+RP>Siz}gp{2oRM4GJKqA?Bx
z0qBAM&H*%A2u)x%NPuD|p9UD01vN$_JxT<l6o9WQovlq=Cd|Sbvj|OKr`{;?<`7VG
zh{z&Yn5%16z?}egAWxCWRHU!Yj3`|uIl~954Oe0<Xpw0zQrIxlprS?_nyxmqpi55e
zzzH{r)0++`4S?PY{4cJH;{`0*WN@E^msMmWZa@a4!)fi%p&JNJx@ia9|6y{Pj6gl8
z0;QlulM$C==tUx>7-mr~suA&2qY&c7q;%R<=uI@%fJnyENpCzvpgpz2)rvw_OH#@a
zQlij3<tU7upRg!efX|AiwFBJJ>sTqPacxGaOD$FtZp>W-cbCuD!Y~IYE;j2bJ{~pP
z<BTIWFGCBf_!!g>8jS?wBhx#7!pb6+tXJQ_fbjTPE;n6U9(-Ly$E1Ztp$v>&xv~>2
z-rK6lD0|`FSBq^Ovh(yu*?tHsSDNT<{Io|>aL}vqlj-+dUb-*F;)Y}tt|G(sqn_<Y
z|Gn);6aT64yUXkExx8U-c|*$f{jhyZm{vDfX85~ohwR?OP6-9Z$$`U*k~=k`PW-2Q
zWI~65EA!S}^M-Nzm2sy`JL;M?88)J`0@&1*KI_3xv@lGh{q$HR0J?Q3RrwYb<prP?
zwMKqe=7XiKpx-t82r&m!jZC^K0Ud&U$$YZ>h|7Lf+VWK>IH#7lhg!Lo53449?Zxt=
zY0LK`Fr8&ZU2-9Be<lSuVf_=eao6%`E5K#|%l9kGC;O-4L)^%R!2I1Tp9Iu%U(j9E
zz)=I+LdxoCcY_iCdyRlZ-R&S>*|{4@hTUcM_iEx186}4h&2$JUo6lB4@Iy~&Lx?~r
z=(q!=2z4oiCqe;21=uNj*gV3du=A%Ic~K$@J{lQ$yk&rctciDaaZ04K6nf!!CTai)
zVO@}7kRcUaK)}4l48uCmquRuI2eAQu2#oRewB&9Gf-ts0r6EWc=9xJ)Q9<bpH)csU
zSsDe*UTaRb2#6w`c`1N94w%4U7%st?eG@;qkj~3#1RSCp2I0crnf#m>gSSj^swWCK
zz|#Q+hjj9OIF6H2;T%sstRD(^E{OuK4-7(qn5hCqqzdnE1t`WEi30031_h2%<51*~
z3k=9u6@(CwxmEnTC`!lSWE_zB6tV?mK~+DTmjkkF9G{7zkj_s5Sww+bKo(ae=iZED
z3vWK?eAvv3F2+qlY<`q`7Z+i?yAA24?lZ8QNbRL=I@~J>B=^*77bvrky&*$ir7)1}
z(5#$P&@I&Qv?Dq5XZ%nWU-`N-WjP<u*S_xDx}4vSGC5)e|2bYeN&aCaUr9-n$-m-b
zqx_ezCY$f!^EqGpx8x`5_)+9ele-__W3u#H1k;_zo)d9<$kX|LUE*3MVn6?pnGoq9
zK)tJi6mHBwl+B@z-5dg%k$F)LL9r%jzvbi&yrXma4uz7y6i%SfXil-Q&|N&>Ept-B
z3a*=a17t7-`hYWCKhiVSWHr-Picp$k?<W~y@pHn?#!dXxJcNt7A}2LhB`E<qx&Rsx
z1Vy9*A_MtAx@NLs$se`wtA>c%nz^%c7jMXFZsrt<Yt}KR{(U}waJzaZg&H24V67Z$
zEp4WLal<`kGbQwu==|>c$kkEh+xL0RxK@>Vg>6CaHS!lIOpUxNE}<|9nrdcqEFS(Q
zdBX>M1?zw8ICGi!nktkTOtXdYli^nW0*j)ikir%*|L$&HrDl=e#_N#(p4-OfqWEJQ
zABE!cHvU(B+Y0ATdw3Db<Xe0A8XmjNy=r4P-C`F3_~w255w_JC`Z2$rwK~81m`@8m
z`z~0<jvjR8w(}AvuN@h<*OTYB^HE{`^F7Y8Lwq6o#yN6`f5vb9*x7rSpTY0=$dN}V
zExXS-^9a8Y*N#se;a7xyz0;+X=WIE`4d>ji_;*7a-f`RVsck@V^H+QvXKy8se9hN$
zwmW(6zxW$Ed;ZC5#6e!#zMq+=$<a5;(-P+B2OwK=#1*{xW#>jA9>*xdbn!IXmfWX{
zHqJgs{v{yBaQ0R5+mP6u&7MuRmWbto-|~iYML+QYd(4?yE*A48|42SpF7Qgs{^SHH
z9`4WfCEF*8rYyEOdC?S6$JoD;57vpBbihxZF3t!dp?}FNQHtV*S)v@p|I896vmMTs
zS)!<5$5uuQeeEKo3=3ru*t2#dKb<A;BJMV4!v(6_s~3pj!2jj~>Xwujf@St(^4%YZ
ziCMh)-DKHi;sF*qh71Y`eYGR!i2~<?`QjW(a-Xt5^k;7*>lTQG0_*dKh2nCo)}TdV
z9KZMD<lIH#db;X(`w9{E@pbLVy;su&UUf!XBbwQ!<c4d+^MbGb#JT#v#0!u|)lbFI
zAiupWIpP-4sPpB2Pu_d0c$Tpjl5>}d?-#M7$tUg+n<D()9nOOFVluzyZD-?p?4{5<
zXUBSBBw!k`kb4{9S_M*j$N+G}(yr6TH#Kl>P;|ss7GCbMkj=znY<k>%Bj3FphU&SO
z84}(PXTcZg`vB`Uek=@Eh?KYbGy^vvXVWJaF?@1?e23Mp#ga{rifxcX(%L9ybN=v$
z&Tm>ot+IiAEn;*8R^eVrm09h{K~IQhl*y3cX_J-7?0-^>8wyiHlflUC&|RozeS2Kg
zgC_vo?;(`!!Ty}7e-O`3c4u0HJuEd#y2Hg-prZCp3pQH`i)D>y$I*hGRoa~>x>ilZ
zRz59;DPC_*ikg)BIn_Di88NoI54zzQVY08CyPpwr%TI#vVm=~+v54R9+&Vj-!e#mY
zckZ5h`v2xG3GU7h@s)QtU;SAO_piBwTWjuchQ27uusAg@idS=JbIIKyz9#j|dRxq=
z`@5@liUla%+bO<6>l^Qg0BuU|zAH{*tmDpIqND<^dvB6_C?CH{>XllyDe%${-pKS?
zuV013L=Ij_>Ts%eiNZvxy(!)9A&ybqBN^QnXQGW%XyFXBMVV;B6k1jW+JaSy6gS?e
z6$&qufp=*pxj_mon1MDg6HO|#KnB{JZfMSV?~DA?Q$m>C4WeXNq6qjh2+T~QIrqHZ
zO@`;D+r8PuRCkfly*?8yqR@B-T5Tp;P@%C5wC}DOks(7};lZqWEPQpEoU>t{7@7Rl
z2cjD9!#Mx=Pz=EhQs;-Fr1YajEoR<wnTI_BuVj-Fky=!0d!6B}qPTQV7Z`8v6PVVy
zQfqVO5_oqPn7d$Fw@SQ5ct@)^9Yej+D*Bb}16J;Gq?+hN;i8e)E47cDZ;8s@E;!!c
zra?%omD(O>_-=sPy1=>%COK7VyPdfJ!@Xa-TTDxQoMI(}D@Ie_f&+;n1)BgavEWTJ
z#lkq3g)V?Me^>!J5KmqQk0+=-@zmAs2?TnM_EE&8*TLfwiVeVtB7!EPz13-IQzCk-
z4cp^BXIq=M8lI#)ZVD9gUZ=CoTMZBF4Guj5tv07}kGC2g*jo+gg&OLjJ)#4*yVLfH
zlC=4|&LWnfGuNYs>nuElhM3?}ax&0JNqV3`YQ4Cg#hGX$S1lsEXa+e_svhJZRG?+L
zq6L{~5HZj)T@mS^=kR8HBns2E-iNcubUj_p!np_C(*h%%?ZIS;8a>n3%*;f?5`sy8
z#*|6Ldr*rhv`oiSe}ccnl7pO{L5@tK2f3g^%XCEF%|wHSfEq_8qx+Tt&!MNNB{%I8
zqgbK?A6j(g(`%|6;hLF^k2a_Ur&=^ofNAEWnz(wIm6~FyCh#Fe=WZss`plsp+VSy1
zVP_5+0@tdxa<{FhvzV~5-NycIV{vD!7%_@f=(Y|(t8-Pmxa;&TuGED=9spfjQMSa)
zPc<p7@ESvtSwP(<e<FTo(Dx;n6ego;m<%T2R=EW8UpV1U#ex(XsHf0CHH8LhUpT8i
z6}85fX=<wGQgf?{+LulzQS;D1ErkYZDKt>~()sy*G08ZTrlx8xHMgp$9dbU~FRm;^
z3qD%IU$3A~*bXHd4~UJd;P858`XM&owbs`;jb=gT9DdlTIw%IQ!_E)fqWPd`D#KdP
zhi{wR&)_yQ^%>k|CQRLKJS{(Id@h#r?{t8ekOiF=399)!qat7H@lO_dus~;2V?BYg
zh+xT%FT@)K_)f!l(XS7p@>9P)ptgC1cRr(#7r6@SMKm;TQHRUTsII-^1y#Qf^`UgY
z+(K{{AsoK1jh&=fKM#gKNX`i8a-eyUoPK`I_a@a(pCzxj_=+E2H{qm9<qxm9bpDT<
zsCM}!*H1_;7^?p`$~7(d>RA10!;cAte_QI@bh<vydFgaLmVEsJeO838dD;2pe7&yd
z>z5hM379MRTeX#inW0(tZFPQenLawY3(r0uU_QL)@BHO5{hY$>udr~Gp^1J{{j7Fp
z$b7xN^p(G`a4t3L_z}fC{0b2AJ9o_2&k4QwDu`o#+nu)g`eeTDH5_F0lSA*lhUSQ7
zee7JgK#ymw-iF7RX06-iBo^ppp}*5}o@Tx0+`B*@^qpV*m4&m*K)0-n{^DOqv;K@m
zhJpT$vapyM)t^uSRlxx<(MdQ(nzih2$<R;qOC#)1@``KpMf5)Pd)Mk`u`N#gI$Th;
zJ7-;|58!JKIP<R4S5Djc8G12%7K<`h-WpT0rYX`t2;~^Tb0D~bESwcZTLiz@t@s=q
R4K!K2dKwPWYp3+@e*yY(=RW`d

delta 15672
zcmb_@349erws%)m-z9fRZuX6Yz70x5f*=S{7Sk#aV+0%-hj9?`={)cT2BNr)=mlk5
z!yDdfwD^3C21Q{6H4vb2)G;873n=(dQ5gpi5Cst(1oSiV{-^rhESKcH?>D~>eo0l=
zId$sP+3M8k&c2hGe?6Aj7~nk2>=0kWkIid(gL@eNzYo}Ni;--#ovG)sb#|G4wr9z=
z%vxe!tuJCL>|^?v(w+Zch8U?=8r(uRX0SnAGgz2e%YeAdUhK=U>pb@sz4|RH1Hc1J
z3w=WX&1wWnqkWyXV5qRxDTGmBueAcB8i6sSIvK$P)bzOo0B{Kq>Ix9JJi*p-?=Hm<
zEz>1K4_}072Ca}0G&JipyAsHnb&AzlTfb$oDCcnty9B$YZUNB}x6d@Sx)I=fiG7oQ
zrb+vZ>O9UXfr)JrrWcrOrpXoBGW$pWUp*Va+{UKY(xRL-+kX%K2U}v_5*k`aYnTvG
z8Q=LWGkF>Jn@TE6>@A@x2sIM!<Kplnj@Az6m*K1Zic4m#vm3)jAo@;tXm65;k7Pz$
z^Pm$729vA>M}(OXoMQ2rSn?ls|MbIbnO%}`IXz6tSWZoDWGq(x*T|z`%%|;mAQLR`
zbL56$M3IzAIxsjokcdAP)k0}O!%He8e119b(Koo^tu+-ov~0a<FUlMe|LHj40AcR;
z)CsYm_0ae1lK5Z{osG`yhS~V1(<4xgfE9{aGwQ5e&wuc|=T=&I{`U!s*mFG5nQa73
zh*7s7$Xe4^XEMWkW!P`+R!f;4EVk3_`VIS<tQ-?F%Qel?Nk7n=`cH<aHVf?i$L;#8
zi`Z;?OICR}L4{b}dE92%lh|DQU$O^yY56_J?f=XkRsrgX6$~BfBrbQU1r3cvfxfb^
zAXny2s|a_MNKUW#YIj`?+-nBEvY1Q@V@kR~ON(mOA<Wl_=zera=oVR1oH}o!<4TmC
zjwK`_D#=i1LJmw>u?Zu9eHqFccA=wLWOkM@Dy)?Tm8x{BtQI488obsb3D(ja4LaMI
z6ZJG>m3sTfoW8}%TY)7MmKJlh>$>ixxxEsY({qcuyW2AO745eJi_l|H4+(f#ZaGKu
zQSNz{kg1%xzOrCi4|{gkLC<HgTLy8RBo(vD(6e0Ls`APQ9XZa<;o4vx!LA`G)%RN%
z#j`&~yE#*k$Bx^}^KOhPC(_znoxy>*a&^~-m)JW>itNktuQrssGhPW*CY;>^Ty3VQ
zj0rLB)%<nQgdb>RhAOau-&9XHSJ#79gT1lfs$@9SI|+y96&4r4)nGD)0+ZXRXbDF|
zusg3X-@d!hN(8+1g|Dc9N0`{NionW85y=w~_=%zyJKe0Wlg$S0e=8pAZdhx-VI`cI
zrY8~IelC8e2urD8456u2$ps9JR<%Fxwyo0@Maqh|6}E|%)J!r>2GHwE-&ad;+L-5z
zC9decxreL4#L*CCP$e_`Y6+j=s%D8j(6}H#0#ug0qUly+UIaeQ`u~bO)b#8A=WyR=
zpJ;|Z(oJ4m%JQ2U1}qi)=lxBW4_<vEKfSZ*^b>cm+~!TJWt@iJiH6gY{ph0VE+Ahm
zs_X)C@t?Yy^W>kpnqw`?v3*Yt>Vjeep#S=0pDv(FpX_Qq?^EX&V@I(*nzm{|1u4!j
zjF1YDxA)PSo34K98pa&^`394%wYN2tvNiURh7#lfl}mE%?u-4b$u3)5!dBT=J)N88
zSF?q+(w>eda#X_*IM?r6d_HKbSxhu`EhZW#F-Otp)h~h7bVQ3`*`@_%wBCq9(f+#P
z3f?CHCHf_d=$5cXd*0KdK<|U6iQcKFiC+FQNqS@dM|w4_^y)BUrTs_zZnR%`hWOj_
zj5&O5Di_Tfv2kZ&+L~0bLC~2AC~c?&0-1>4u0W<FAW9(iOH2G%Y~7M7zGsd-=g&oU
zw`cpK+cf6cEY?v8UI@<Dsswx1b5&g^_T%TeQf$T2t_*43(yrzl>2i*>tgAT<%eq?U
z`!45<`}4W32(F-J%(Hn-`~S=t+hBk3e6JqsUt>ul@WBFKWv0(kV$LxH#NsW7&-Gtk
z#NM!LmK(^YrY)a>nIA6iJ-o}A16yadqisb<cm0x}H|*gnx+!wMuwPh_2a+3C^zNFO
zamjhjcIFGjOrIB$v;Xr2n!W0U4zqQ4E4sUtnQgOAtJy^_((J)6K9Dqxk6$FyP_i*!
z;_TclD_W&eamfV_EwxT-b*9%Pnr3NL*vc&OwsLK=J*F|rHraKJMQo#8-`Im~u^Sun
zvq_RjDF}0dE62_DCkpc?g_*yyN5@rqB~}T!tAyRTLtWF{Z2xv;wA-dOuEOrl1}ck(
z?QXQ6P9<n02xhqiLoUH?Am|~sk98y%N@`tm5jNW8t4bgw*3`|OylQx&w;|EHmNhp`
zXxbw%Jj~g^npdqG8eprNhHco+!;4PR8A`ctz?yFtZJN>K+nmomD^AiW+T+b}Y=?d4
zn>YEkZf4Q2wYur^H)9M_N58$6wVN8j)Xc5B?MF71@$L2Y>P=TSeZBQzKM?Qu;4mvr
zuAv!0Yra|!>6J?0w%Yf6G>C6MVXyh9CpDjb)IWdw31q$tus(jk8WJ+fP!`%fb`2f8
z5X1O!hgCj_jLju!ve99rxe0JhSQC{=)cH-1?|M$~o%Qzc-8WFPc=z?Jp^5!%KliWx
zny8sU``!=oZGF%8(Kcp7?U>zqf>{CL156g}?UlFd1X3<sI4qot7QdpHK)X-azuH@p
zx5ouV6i9bL`%l=90CdO&6)2F`1syqIzXs4T7i1`q-vu2%VIKqNlnW|Tpnwbd>4e>5
zUo?02NySD#1q!*Kx%RL3mF7O=LMjv_-G$WK_4}&2l<(nv|Ga!Bem>3vi8dV?kdyh)
z%_eSiP(^VR*=re}+jR1$dpLsoZKnsNHy}IfN5YEZy4u+E^63y~4RSMM*RZ*=gtHsN
z?rp>HQO?&W=W#aBR5vD?q>^lO=x&rq{uJH{`8j8ERSM%LycM!euxgLGgv(~Z-0Y7~
zlX)0rI)*2i{W<?fXAcKPH?Ck)4a8N=NW=MJB1@|4qa_C^YvQ-J4#fzOV@yvO6BzR%
zJ!q6OGFa|KG+p6(I+^e7WWKL;{_B_@Z=Ijk$^1Yk^MlFx$%R9-8_{kzik)yL<kHV1
z7uPaU=|za1o=PvX6M9)^($lh2$>k6^Pb#_G1i6b78c;mvbwV#cm0khS^O|X{retO&
zE~c$QEHcv0P*^d>l)^}0YMm0qsZ>>$Vk}kF-7%(SI&(8w3-7b>G{usr-)dGJgH^%c
z{E<RLB{4Hn;sV?%!A9qW9CoK*8=bQY*nUs;qPK9bni};QnTFShklQpkC8eyCu>o>u
zclI+I;0!TX&k$QJC-q^siSsS59M+c=$YXujP7H1;XScEQ<&eJYD4u$sM^8?}c}z#N
zme2NMkKu7hf7X(Q#|wwB33z;b2>TO32M%MC@wjLh%c94ZhO-GEpCzw9#dDn=7ZVSw
z<t3Huw;0WFM_;dGpPa*HJ5R*ej~=v-&0vEwvKpqgT*DitGWDC&Z~0lA)pj=io7tU@
z@9JXwmvcJDf4__IG52+jzx6&=*9kw)nERQRbpnu+=R!CQ&U<s&V9^=8;GxbJdaR4_
ztcN?t|6>>9p+`E$fA|qLtuuc7kFx2VjbnqQa_OV&NM}H7a<JT8&-!7f;d!i(jHK7&
zY&H~Lq7*J)f1F(i6a3+EwuG@)ohKKv>G?oPZ&Y?Rw~;lZ!}`v8osB{>?R9n$fXiNI
z)7WY!ubI8>#p8#Y+08Js$}Mc6&IZVj-(mOSar{<xj=(zS%BwzLInKiOSRX*0P5;I2
zXMERgS^WX~6xe5f$hu>A<cIJQw3BzS;a;@Qea7+`W)GBBDa(}~{GDBo6))Js*6@tP
zId1YW$AxLW{9zxv4^w7+&VDb@o;=8Yt>fsq>M*;B)yuVqnK9&eJ?f0O<yoikPt=Vv
zwJ@jB)cm6=;|PqzfM)G<GX|srsxwjz$;=~glxErQ2rH?oKZ>f5nqwwt8JfxvS_wrp
z(Cxugl?~;9u82N%gc)Y@*GN4gvn<vF^@WjPQ4|cSnTaKeta|68BWxC9>*TnjtY_lX
zyXPoF?O7Im!`$MtMu(s7RtTt0Vu<A7Cm;NV-5*aJyRDUHRjX6wE4RpkvY4t_Q;L<S
zG$oGU|D_o1Z!1PCnyNl?22J_!F;<aaee*Hau0Gv4&tTUrP!m%qP_smVnsr*B78auz
zi~==t3)C|H9h>P^aCrL)4pp*I9`v~t94Gr9?7_5N%T&S{4O&awPRMFd9m7NMJ%sok
zj1*c+<ocgj_W(d{8D*LL>L+#%%rU2hWwUl^Ww`USQda%UdKr{gdfmLz*OphRF<c1@
zpMAsLY?WaGi84%|)tX_-bw4vV#RQB}OrO=7V#>n=qZ|`3$}xRbYmO;PPXk6tCSa6g
z`mEL@Q(kqNxmhM)lx6y?WR@u(Jk8uR6Cg@6eO5Bflz%zRy38{lE9C!lp7~fk@5ve-
zTgj*N+NpeX5btTB=N8;q&?M)zMi#^E=1dx+$E^lg$#^l`d;;S|h;DZ<{-i&!bvui~
zj>~wn(>=g_0RVdC@TruEF398O;c;Ugzm=jxMKNEG=Eq__4Wm=K@hJ?CJ4*Od{M38$
zmQwyaYj#Sy^Rb-GcYYV;F=WB=<FmO0c3m%iEl4_B&*29cn<H19%YTI?(1%l;9ny!d
zrqN^NJQt0(FE2%NR$qQMn#FzjvuH-0$89v9p2z=;<`4b&1!!LB$EW4ad6!W^=n4cR
zmJA0C&anRcLU8TeaRJ|%#TLn$Dqa)9%|+GNDUL~$SE_K&l1HmJ-M3`U<$OR_7Xmr)
zaz31{P|0f)RmLg1PTsc+tp?Y|BpCg6H!kHMRbed{v?n=(WOg+_E9pmRNk0m7@S_0y
zC{6j%_0_zCA3apf$&Xf5^Dnx-OQN2SyJV4kw}yA4ogJxhcbIi0@A671uf6i0w?(<_
zN?yoY4#`7T@>>x3e>0lj#<xEr-yhA-r`9`$M=@MBhMWDH@8dBXLfaqWq_vAQTxz3m
z?<h_)TyI03AX<gP07HMhyn75U4R;)RZVVsXt-}ye)ylQRqz&?iG5nIs{|O^8fxD}`
zk$4oukRIYe#fH3lEI&65RHA~ok}r+r)mhDmYAE~1aPdNQ1`9UJ;8naVi1C=EHq<PK
zUj=ViC%?Rk&*aDE$m@RrIiR(F!K35X-G_~08Jv2CFY!tcZ)N827EQ-(t0lwCYUyES
zwp<<7imZq=`#u-YZqTsipE=mzgs$d;+0e%OIOK|xxX4-$a@NYW=MD6{s_pp=dTwfa
zUhTxk@%|q6id^|?{(HXbfGoe3k7FC<eb@5q*jvt?Yx$KC_Jq^7mR}0rcW#-=|HgSk
zsk~}BH|3^Ee%Xyd!&AolL_^f{i-rwPZ$2<uKu;CXbZUl1GpMPIMhJFUw2+>vqFL0`
zM6;<G8_l6+Tr`)O@zFeLCPedT!lY=C%(Qr~@=W*O?}5~V*P3ZPaf^=P-Ry{{NM;js
zsvCF_s+sBXcQg2eP%tELIKr*8Xi$DQgZB)l6TBP!GE7~1>bMb4AAvlFM*PtnydOXi
zE#s3+-w2YYdKi!E8Jt}``XYl(F~Zc{Ct>P_&%mJ>8Dfg!s499GqBw|~3@UGf_+>^K
z1?NilX{eEQ3Eo4Y4rXDn${<|~jNyG<je(ifY9GKA$aMI~TXZ)=0_#2_9Xy)?Ohk2S
zHcEMhUTlW&t~OF)21bNol13zEqQHljkp|U50|t)ENF7b7j_N41<26xn)Tee?&`O9a
zLUADIxOVwKA%>kAF(XtR4F`>I2R;mnP#SHj28=8I)5`cjla|yXg+c3HqzH`{Mh1*P
zRQ6!sp+OvXZ+b!LM}xO^IvOt`D5=rN9Dxm5Pra)d@w7-cOLMb3ZxI2*x~pkY#5h=#
z-@@+j0s$sbZR%JrOhWKue6~?j43;5?tIaU90vOn&h=Rh@AQUq-BRq(YgKPVen?V+c
z#fqc4aOG>n!nFOEl4r~e0*<PEAVyY1)fAUoq@Li{NH<UqK+f~4WIF0o3<H9<Nr+{p
zKFajdnUvfBy_6eIJSox0#kcb8xDp2L{YGRE-btombI`n!iK}*N(RSpg5DIT66g_ST
z4~bxW(8VThkm$fT(hTh!QE3*B)$fhW@EFCx!2)kOdh-ap5KRsoF3sAm#bA^fWxVIm
zs1mP{X$iF_vfu)v(C|jCvB>n0WP6HS9B2rRL#ra8s=-Y{$gy|vk}=S)ZheJ_<DrPC
zTXPQiYM}=Il@>y~;h$;k6?Mg7n3Z+p-!p2{g^n;6R7Pxw8klPic9Uc1k?9_*<&YMX
z@7)DAjQUZ#Noq-@__T0MG?0*uuDA`sw*8qpBDTva6p)lP%){_jn|??pjE@fZOfnjJ
zKMpIvmlrV;#y4JJWAC0;eZoDUPfxs^uyHX_qJn31CPzOU3LkeQmFDdrMnA;pP0~t=
z5tS0G?@F@DwGgs%=CZDABMy2UTpvvOh_bbCt7csg>1~k-E9E42+*wo*ZdAwuagJDk
zjpQIYWWm#cJqhy>6}*;KjkR$KdyOm$M-#!~1f*-4%cY5$KB5hFI-5hBdT}kn#vU3%
zh};AF#E@m+s|v$A28(!Mhh&^&4>;Oq(uWLKHH+d(Evdo~KLv0Ax)Ef&r3Fd0<lcEO
zF+W(1lO5uxsu&AtH56e*rI80G2Q@FbBWPtQxAcwzlnRoNDRe<kpe)FMP(GwM3Q6&i
zMlQ*-nEX~J%WCKISujm71){B5jQd+H0(*l+kS#*oZb$-+HV?-sLn?p~!x6BFf*``B
zYq35f9HVF@XnX=<Z|l)Sv57Y9#&3v~LYLCk)n*<}ptu2{x*;o%LROm?^GJOO=}{bo
z^hkKUalRrV<9vm#nzhkU0V(>>gSteqnO%#Z?=!NiaWKL1e0V)TS3M&egBZ~*AHG3Q
zPe7_FHsWYvwN~Zj7}c#=b;y6RV78D_6lY1)3uUM%0#_PZr!-0~W`ts9q($ReOAwii
z*TQEmUeZWFX$0RUP|z@fv1pKr)+BA{386PZOSdj|X=NF4wH7o&4ok5VdrWwZPi4|>
zxXZ^D3NO{P*%wd50S(1w(+`XWMvWsHO9TaN)8#d2#c#!zC5criq+>6jW`vA<dS62E
zkdAY(8|U}n(8P{#2#VgoEWj$7Hqi`0afpwPk!cqi1q*@*tHLP23-Mr*K`&nD;Ed@c
z5aW<<7l^6;`eth&ZZ*9$vb^CSnvaEwR3J`E`g+uxh!O1qF(xL;Co19bnIamJGw<R3
zLIK2J6S)LjIv`)Xhv&ybD(sGi8gfP;lqTl|;6FK7X*#eET^K1xP>xvtNOpd9JFu-P
z<QHZxkkPM+<`OT6O9AC75ja?YLUF)l*55G{i)u5V$H|4s%423g9sUNaocxx~#bhiF
zwtAjdovm1fJTG~+cA;^lZwgKLAFd_?ASVcr19**qO2(qzcspb8o?#3D7z1{v;Y`UN
zO&!<(nFL-2DvsNmL2f5z5J;FoByqmDW)LFdMe0LmZXBlUns_wOsVPL<gGOl?k_{<Q
zcoUWo!C4YY3L|L=V3x9|6d!?p3ZSgE+{4Yq;!qvT5|McjKA1=|f`fP&8idJO6c5lt
zierFPqwP2g6ARW;@4Gn&C4n6doh;;fiC_nB>zEBzM3Iys-$sfYGjXoOBJmPRP>?6W
zn|+WV9SbU9R`Cp)^^Y(a`7t>&bp-NIDAe!9JB}xH!J;ipG8twgn@<qLVM5_>5;72s
z>);FisL~WIvE#8J`RHt(my=`|D&eu7ni0$hYSI|Vr^J;!Jd~^UF+7wj_Co{T9*8D=
z5U)U!F^CUEgJccg7#W2X{4z9Il)rn7Y}&ua$Qi`PqRB<vMuYG@9!)+4bB|F}!6)_L
zJ*c#5?bp=HTrGH7aDFdR4yR!bzlq1)Qfid7x+k42%wwVU8JQgBiBfA2ADvd>9-64Y
zLMtdnBx|#75f$YK^}ZNw0QXH?YOVdMOdI7mc{5^If;?EL<Qvop-{8*ubNS%_-?LrL
z_#+?6k8PJP{*gb2*12Xre*o{zoxj<9K3)HQ_ZS}>=6`?7S^6}e%=xixPUKmB5>+3^
zznl*Y=sP&`*y%5ikr!8SZ#PP#)Vp={UI-jS2IEx0gWvQ@wwC8+d1DrvY0XAF(2>D&
z9D}m|T0USPt`|BX;-ZnzKORDa0G3huOYFl#`+yZ-51G2$wM7iJn_n(;*01H$&JkNr
zGueL+j|Fy}PJ9o#Q$D+gPwur&Ndzury*SL8KiGOxy{;;bx~V=>ve|>l1NQQ$7acN|
zy+>GgdDmXvqoPsOc1-MsH0mZ1rbc-zs@(XFk+@DBjuZz!b9V0K^O(S5PW*GefrZf{
z|IvaLaXR_{AEa1*@Bpty747{4d?K3u2YG)qHy-5Migu}*AJw5N4CM(_*cuqLcGWx8
zU+{%I;vyJqBwd6sEWi7bFGnHl$*=gGY>mu6%txoq{{(#EZsB*|Y}|jCk7Um|Zy)CU
zgZ$HdvgHJ?W$((HPx7z%?t^mtDSlDO=lgN|)9Hsh`JqBH1M_cyqic~WtL1=v=@g&E
zUY31+;J@SFd?XkBz^Af5IB7rf>p5HQ-24+?!P#0T?=*j3XOBBy3UQPdt~<=k3ykw;
z85hLO^Or)Fl#SuriB0mDSG)vj4L<QQTkKrx7yCGCa)t%PK+fKAZV8LM=_oah>n7?2
z-?c;T>n=WH_sjK$n881K-x&}Uc-7wQtn4A4FJ-Sd6D|=m1MJ_O9W|nwvF%Q`(c&H*
z@F#vDF7g-SrDGHqe^JJlaWls%r7yg56eEk|H`j_1NG|*varU`OwlK;RZ>Kku99O_x
zg=Hm&r=rS{Yvf^~wZxhD8?gyhI#8y<gQ6UADN}Ay#uv%q*NMJtiJW$wIGZhT7F>so
zvPYcJ*Nfo+{=vu2zuhdBGPXkwsuh!|y!KSBC}m5Xzt)Oc0Uh?9E~bI6h0{d^KY7r3
zf4aDnimX@MB7z=%;tOZ|9kkP@<TH1Q+3Z25=UrmG;9nh*U(6C4Ah_r65hwk8&tB)5
z2SlCD_rB}oJt|&dOgit^i(lrktxnl9VpWKr{4crtB{7O0|4^RQC=9Y%S=A__>F}LM
znp+2@u{!{~7~has5qP0yMaC4nAO5;;pJ0S;u@ojr_l*Vgg|_t&|0#&Tg_0V*ys#EJ
zA2y251Z#BGtQO-rKmD1^a72~zg=-ygehA*-7PyP7b<RRZ{6)DAd6_m!xz6=3i;CVm
z-IvcuR3d{7#1qZ>`Lw732LQMqSz8nb@FUm1EM6G}?m;4@c0}Ybii{9%hqj+)<r*@n
zI`oln)3Yk#5k=*=ikSYI=%aZ3?k{3QGUg1BFTN%Qw+}quye3SxUG8{A{5H>Jl^AW$
zrmT4t@7(^svo+wg|2MWKIy3$%PQf<j$=DXr*L!Flw+_vdPi_%KP|eF*#M>DZ?*H(S
zI7J3`?<Zn1y$d+_shEOh%x-Z4qa*(&d~{5V{!E<3;`nxS6_OQ?<wN$zd-GWfkosld
ziv|V2z|aMTUaqC%7NpY$rG$>rJ5$jr6k0e1ZDuOkK!p}+MVsi7n~{pwPvHey@m#d2
zsc3x^S|9~&N-LTiyGP`@dj!)rwT>rt(Nj(Lr%a!eie@M@WbhqCFd-GKn?m!Zpp9RE
zqEH8k7AQPV3f{O>aybf3PeB`-iWX65A_c9c9h$sxugJbIsky3l5T#1LBEV7zT-J&v
zU)n2jbCT05TgNvsxy`Hhb-epTB|b5hqdphsu(h)8b5RgqGquUgs>j__eJ@D{LzYmk
zt!^8~*PAp>nlIOy+Q#wOFpZO{%e7T)<GMRex-HjMwvBUloOD~RHM--{zh4XhOGEdI
zZt=Ak%dE#oEBJE*d<;y)$17xFG6BH%sWeVHaG6YuCzvFmyG$m=6HJnnT_zLb2_{L#
zE|Zz{*=pqkmn2niDZkn;MuWc#4~UW?;x8eqB!6%wpskTJ4v751GiBCKftzFlfzK3C
znY+5&4lrw!hYpAfyF1CfrXgIKdNb}+YR9lyth82E9ux(IX9@&12=HoIcMvO)k~;1`
zq`<4>Vgi#+65zycbp>veI}VCtt`p+@?f(*JA}MxwCunr=9>@mqC?f@p%&P<1K!w&8
zwj!y4MvgtyE;Nw2b(r5r&2I}-?sT%g4rq`yxJivuWQ-lqdMdQkI5p{v(BMXzw3N+t
z5ouE67Fl(NRSVRrsd0;pJc%Z6{IXqWATv)wRA`7O0;!RTtlmYFFMTO;|0Nk3$m%;x
z_p9ltp^1E;0~*W(GUTXswLf7HX!Il+1)59c&0mS57dGPy@Rn@)evTtoGt==kIgQ|U
zfg#FJ&5Yy_S6|uDPy|DA{$Y{dn?9n>ZOJgW<-1*df?Wa-#`0S7k-%8#?qDh5?>H=S
z@N)FvVX>&)s#GR1vyw}wRXsG6og7lDs_JSEQF-NvcqU38wu5Oft9oD-OuAiefVuUu
z?x@I?|M9i>WfB)lBymAHi3^hJrT?g?if(8nsd_F+x2s5QAd*v(XrPuv1GOX?sBMrx
z9u*^_Z?sZVJ(rr>Rn*>)H+&<e=VAnZGK1fVhW>!&4d=i&VkOJjxPqao$Su$9)~Sp-
zGpA(&-zcB|R`kSs%J<L&3Bl76))>#0^N))>ne&|(mkZ(3U*Wv!{!PxSO#MxcbI*5T
z2@f`72Hr!oTql}kpYKI+FB<R>iEOR?KO>J&0Q0va@67#PtjfWs-m)T~<3oaaIWeGj
zui9SE-3uLM!YDIX3y?$2Zl*h)Rgdnmc^gz8-&t9`X>dNlU4&rp_+ECFW<B5!zUXWU
z=td7G(pR4lcIt-cYolIp5d2f2j1JQ)oZnujPY&@zo8-GU>(zO0Zf3~*!7>)zk3NfP
z*741<Z%jWwykZMr9>6^Kc34h}=~v`7yvc%Lh9UYf)Q`1JZj0$PMGN0z!Au&~@k11|
z@ROritKU*9r7d_H)WLOwe4thz#lL-9zE`WCowocP42Lw!kw4e!#pMU+F``-DqRq`9
zYV=D>i_mWkzX5w{D{!(jYq=abRquJ}K0Kxu0a{c<zws|kv+Qla!Rk+;&CRDl^(!rh
zx?qbq+eSEfn)UHE=Y^^IO(FJ%lXaIql|I;v|6aeC&6ltLUhi=>ZkxeE+8F&ywuX8l
z8CZt^e}u`%-FksE?$&3HUiLNUGW>xa+RS4QMAW3@M96Y0!80Ilw5S}T&Vrvk58w||
Q2|(ID%7T9SijMyMHz#`rk^lez

diff --git a/wasm/quant_wasm.c b/wasm/quant_wasm.c
index 281fd31..98a0316 100644
--- a/wasm/quant_wasm.c
+++ b/wasm/quant_wasm.c
@@ -61,6 +61,14 @@ static void on_token_sync(const char* text, void* ud) {
 EMSCRIPTEN_KEEPALIVE
 int wasm_load_model(const char* path) {
     js_on_status("Loading model...");
+    /* Reset generation state on load — if a previous run was interrupted
+     * (page reload mid-stream, JS error in the token callback), the
+     * busy flag would otherwise be stuck at 1 and every subsequent
+     * generate call would early-return -1 forever. */
+    g_generating = 0;
+    g_output_pos = 0;
+    g_output[0] = '\0';
+    g_stream_count = 0;
     if (g_model) { quant_free_model(g_model); g_model = NULL; }
     if (g_ctx)   { quant_free_ctx(g_ctx);     g_ctx = NULL; }