Lothnic · Lothnic · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 future_plans/
-__pycache__/
-*.pyc
+**/__pycache__/
+*.pyc
+
+docs/quantisation.md
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
diff --git a/engine/__pycache__/__init__.cpython-312.pyc b/engine/__pycache__/__init__.cpython-312.pyc
diff --git a/engine/__pycache__/generator.cpython-312.pyc b/engine/__pycache__/generator.cpython-312.pyc
diff --git a/engine/__pycache__/sampler.cpython-312.pyc b/engine/__pycache__/sampler.cpython-312.pyc
diff --git a/engine/generator.py b/engine/generator.py
@@ -12,7 +12,9 @@ def __init__(self, model, tokenizer, sampler: Sampler | None = None):
     @torch.inference_mode()
     def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams | None = None):
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.config.device)
+        prompt_len = input_ids.shape[1]
         past_key_values = None
+        prev_text = ""
 
         for _ in range(max_new_tokens):
             if past_key_values is None:
@@ -25,5 +27,11 @@ def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams
 
             if next_token.item() == self.tokenizer.eos_token_id:
                 break
-
-            yield self.tokenizer.decode(next_token[0], skip_special_tokens=True)
+
+            # Decode all generated tokens so far and yield only the new text.
+            # This handles SentencePiece space prefixes. NOTE(review): a multi-byte char split across tokens may decode to U+FFFD first and be yielded early — verify.
+            full_text = self.tokenizer.decode(input_ids[0, prompt_len:], skip_special_tokens=True)
+            new_text = full_text[len(prev_text):]
+            prev_text = full_text
+            if new_text:
+                yield new_text
diff --git a/main.py b/main.py
@@ -1,8 +1,9 @@
 """CLI entry point."""
 import os
+import warnings
+warnings.filterwarnings("ignore", message=".*_check_is_size.*", category=FutureWarning)
 import argparse
 import torch
-from huggingface_hub import try_to_load_from_cache
 from transformers import AutoTokenizer
 
 from models.weight_loader import load_hf_model
@@ -15,15 +16,6 @@
 MODEL_ID = "Qwen/Qwen3-0.6B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Resolve local cache path if model is already downloaded
-# Passing a local dir path to AutoTokenizer prevents ALL network calls
-_cached = try_to_load_from_cache(MODEL_ID, "config.json")
-LOCAL_MODEL_PATH = os.path.dirname(_cached) if isinstance(_cached, str) else None
-
-if LOCAL_MODEL_PATH:
-    os.environ["HF_HUB_OFFLINE"] = "1"
-    os.environ["TRANSFORMERS_OFFLINE"] = "1"
-
 
 def parse_args():
     parser = argparse.ArgumentParser(description="vLLMini Chat")
@@ -33,6 +25,7 @@ def parse_args():
     parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
     parser.add_argument("--top-p", type=float, default=0.9, help="Nucleus sampling threshold")
     parser.add_argument("--max-tokens", type=int, default=2048, help="Maximum new tokens to generate")
+    parser.add_argument("--quantize", "-q", action="store_true", default=False, help="Enable 4-bit NF4 quantization (requires bitsandbytes)")
     return parser.parse_args()
 
 
@@ -45,15 +38,19 @@ def strip_thinking(output: str) -> str:
 
 def main():
     args = parse_args()
+
+    # Don't force offline mode for model loading — the weight_loader
+    # handles local-first-then-download fallback on its own.
+    os.environ.pop("HF_HUB_OFFLINE", None)
+    os.environ.pop("TRANSFORMERS_OFFLINE", None)
+
+    model, config = load_hf_model(args.model_id, device=args.device, quantize=args.quantize)
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+    # chat = [{"role": "user", "content": "Write a short story about a robot."}]
+    # prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 
-    model, config = load_hf_model(args.model_id, device=args.device)
-
-    # Use local model path if available (from user's caching logic)
-    tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH or args.model_id)
-    chat = [{"role": "user", "content": "Write a short story about a robot."}]
-    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-
-    # prompt = "Write a very long story about a robot."
+    prompt = "Write a very long story about a robot."
 
     params = SamplingParams(temperature=args.temperature, top_p=args.top_p)
     sampler = Sampler()
@@ -117,6 +114,11 @@ def main():
                     if remainder:
                         print(remainder, end="", flush=True)
                         parts.append(remainder)
+                elif not indicator_shown and len(buffer) > 20:
+                    # Model doesn't use <think> tags — flush buffer and stream normally
+                    thinking_done = True
+                    print(buffer, end="", flush=True)
+                    parts.append(buffer)
                 # Otherwise keep accumulating silently
             else:
                 # Either HIDE_THINKING is False, or we're past </think>

diff --git a/models/__pycache__/__init__.cpython-312.pyc b/models/__pycache__/__init__.cpython-312.pyc
diff --git a/models/__pycache__/attention.cpython-312.pyc b/models/__pycache__/attention.cpython-312.pyc
diff --git a/models/__pycache__/base.cpython-312.pyc b/models/__pycache__/base.cpython-312.pyc
diff --git a/models/__pycache__/llama.cpython-312.pyc b/models/__pycache__/llama.cpython-312.pyc
diff --git a/models/__pycache__/qwen3.cpython-312.pyc b/models/__pycache__/qwen3.cpython-312.pyc
diff --git a/models/__pycache__/weight_loader.cpython-312.pyc b/models/__pycache__/weight_loader.cpython-312.pyc
diff --git a/models/attention.py b/models/attention.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import math
+from models.base import get_linear_layer
 
 def rotate_half(x: torch.Tensor) -> torch.Tensor:
     x1, x2 = x.chunk(2, dim=-1)
@@ -19,10 +20,10 @@ def __init__(self, config, rotary_emb):
         self.head_dim = config.head_dim
         self.hidden_size = config.hidden_size
 
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.q_proj = get_linear_layer(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.k_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.v_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.o_proj = get_linear_layer(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias, quantize=config.quantize)
         self.rotary_emb = rotary_emb
 
     def core_attention(self, q, k, v, q_len, kv_len):

diff --git a/models/base.py b/models/base.py
@@ -3,6 +3,19 @@
 import torch
 import torch.nn as nn
 
+
+def get_linear_layer(in_features: int, out_features: int, bias: bool, quantize: bool = False):
+    """Factory that returns nn.Linear or bnb.nn.Linear4bit depending on quantize flag."""
+    if quantize:
+        import bitsandbytes as bnb  # imported lazily: bitsandbytes is only required when quantize=True
+        return bnb.nn.Linear4bit(
+            in_features, out_features, bias=bias,
+            compute_dtype=torch.bfloat16,  # NOTE(review): hard-coded here, not passed in by the caller — confirm it should track the model config's dtype
+            quant_type="nf4",  # 4-bit NF4 quantization
+        )
+    return nn.Linear(in_features, out_features, bias=bias)  # unquantized path: standard nn.Linear
+
+
 class CausalLM(ABC, nn.Module):
     """Every model must implement this interface. The engine never looks inside."""
 

diff --git a/models/llama.py b/models/llama.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from dataclasses import dataclass
-from models.base import CausalLM
+from models.base import CausalLM, get_linear_layer
 from models.attention import Attention, FlashAttention
 
 
@@ -24,6 +24,7 @@ class LlamaConfig:
     head_dim: int | None = None
     dtype: torch.dtype = torch.bfloat16
     device: str = "cuda"
+    quantize: bool = False
 
     def __post_init__(self):
         if self.head_dim is None:
@@ -59,9 +60,9 @@ def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
 class MLP(nn.Module):
     def __init__(self, config: LlamaConfig):
         super().__init__()
-        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.gate_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize)
+        self.up_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize)
+        self.down_proj = get_linear_layer(config.intermediate_size, config.hidden_size, bias=False, quantize=config.quantize)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))