diff --git a/.gitignore b/.gitignore
index 36b9821..16c8f68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 future_plans/
-__pycache__/
-*.pyc
\ No newline at end of file
+**/__pycache__/
+*.pyc
+
+docs/quantisation.md
\ No newline at end of file
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
deleted file mode 100644
index 0d03894..0000000
Binary files a/__pycache__/main.cpython-312.pyc and /dev/null differ
diff --git a/engine/__pycache__/__init__.cpython-312.pyc b/engine/__pycache__/__init__.cpython-312.pyc
deleted file mode 100644
index 61e8cf1..0000000
Binary files a/engine/__pycache__/__init__.cpython-312.pyc and /dev/null differ
diff --git a/engine/__pycache__/generator.cpython-312.pyc b/engine/__pycache__/generator.cpython-312.pyc
deleted file mode 100644
index bee5af9..0000000
Binary files a/engine/__pycache__/generator.cpython-312.pyc and /dev/null differ
diff --git a/engine/__pycache__/sampler.cpython-312.pyc b/engine/__pycache__/sampler.cpython-312.pyc
deleted file mode 100644
index 947a252..0000000
Binary files a/engine/__pycache__/sampler.cpython-312.pyc and /dev/null differ
diff --git a/engine/generator.py b/engine/generator.py
index 8dab21f..b3c477f 100644
--- a/engine/generator.py
+++ b/engine/generator.py
@@ -12,7 +12,9 @@ def __init__(self, model, tokenizer, sampler: Sampler | None = None):
     @torch.inference_mode()
     def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams | None = None):
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.config.device)
+        prompt_len = input_ids.shape[1]
         past_key_values = None
+        prev_text = ""
 
         for _ in range(max_new_tokens):
             if past_key_values is None:
@@ -25,5 +27,11 @@ def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams
 
             if next_token.item() == self.tokenizer.eos_token_id:
                 break
-            
-            yield self.tokenizer.decode(next_token[0], skip_special_tokens=True)
\ No newline at end of file
+
+            # Decode all generated tokens so far and yield only the new text.
+            # This correctly handles SentencePiece space prefixes and multi-byte chars.
+            full_text = self.tokenizer.decode(input_ids[0, prompt_len:], skip_special_tokens=True)
+            new_text = full_text[len(prev_text):]
+            prev_text = full_text
+            if new_text:
+                yield new_text
\ No newline at end of file
diff --git a/main.py b/main.py
index e3edf97..4049548 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 """CLI entry point."""
 import os
+import warnings
+warnings.filterwarnings("ignore", message=".*_check_is_size.*", category=FutureWarning)
 import argparse
 import torch
-from huggingface_hub import try_to_load_from_cache
 from transformers import AutoTokenizer
 
 from models.weight_loader import load_hf_model
@@ -15,15 +16,6 @@
 MODEL_ID = "Qwen/Qwen3-0.6B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Resolve local cache path if model is already downloaded
-# Passing a local dir path to AutoTokenizer prevents ALL network calls
-_cached = try_to_load_from_cache(MODEL_ID, "config.json")
-LOCAL_MODEL_PATH = os.path.dirname(_cached) if isinstance(_cached, str) else None
-
-if LOCAL_MODEL_PATH:
-    os.environ["HF_HUB_OFFLINE"] = "1"
-    os.environ["TRANSFORMERS_OFFLINE"] = "1"
-
 
 def parse_args():
     parser = argparse.ArgumentParser(description="vLLMini Chat")
@@ -33,6 +25,7 @@ def parse_args():
     parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
     parser.add_argument("--top-p", type=float, default=0.9, help="Nucleus sampling threshold")
     parser.add_argument("--max-tokens", type=int, default=2048, help="Maximum new tokens to generate")
+    parser.add_argument("--quantize", "-q", action="store_true", default=False, help="Enable 4-bit NF4 quantization (requires bitsandbytes)")
     return parser.parse_args()
 
 
@@ -45,15 +38,19 @@ def strip_thinking(output: str) -> str:
 
 def main():
     args = parse_args()
+
+    # Don't force offline mode for model loading — the weight_loader
+    # handles local-first-then-download fallback on its own.
+    os.environ.pop("HF_HUB_OFFLINE", None)
+    os.environ.pop("TRANSFORMERS_OFFLINE", None)
+
+    model, config = load_hf_model(args.model_id, device=args.device, quantize=args.quantize)
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+    # chat = [{"role": "user", "content": "Write a short story about a robot."}]
+    # prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     
-    model, config = load_hf_model(args.model_id, device=args.device)
-    
-    # Use local model path if available (from user's caching logic)
-    tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH or args.model_id)
-    chat = [{"role": "user", "content": "Write a short story about a robot."}]
-    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    
-    # prompt = "Write a very long story about a robot."
+    prompt = "Write a very long story about a robot."
 
     params = SamplingParams(temperature=args.temperature, top_p=args.top_p)
     sampler = Sampler()
@@ -117,6 +114,11 @@ def main():
                     if remainder:
                         print(remainder, end="", flush=True)
                         parts.append(remainder)
+                elif not indicator_shown and len(buffer) > 20:
+                    # Model doesn't use <think> tags — flush buffer and stream normally
+                    thinking_done = True
+                    print(buffer, end="", flush=True)
+                    parts.append(buffer)
                 # Otherwise keep accumulating silently
             else:
                 # Either HIDE_THINKING is False, or we're past </think>
diff --git a/models/__pycache__/__init__.cpython-312.pyc b/models/__pycache__/__init__.cpython-312.pyc
deleted file mode 100644
index f64a187..0000000
Binary files a/models/__pycache__/__init__.cpython-312.pyc and /dev/null differ
diff --git a/models/__pycache__/attention.cpython-312.pyc b/models/__pycache__/attention.cpython-312.pyc
deleted file mode 100644
index 7c0880f..0000000
Binary files a/models/__pycache__/attention.cpython-312.pyc and /dev/null differ
diff --git a/models/__pycache__/base.cpython-312.pyc b/models/__pycache__/base.cpython-312.pyc
deleted file mode 100644
index 9c4f2cf..0000000
Binary files a/models/__pycache__/base.cpython-312.pyc and /dev/null differ
diff --git a/models/__pycache__/llama.cpython-312.pyc b/models/__pycache__/llama.cpython-312.pyc
deleted file mode 100644
index f7662be..0000000
Binary files a/models/__pycache__/llama.cpython-312.pyc and /dev/null differ
diff --git a/models/__pycache__/qwen3.cpython-312.pyc b/models/__pycache__/qwen3.cpython-312.pyc
deleted file mode 100644
index 69214cd..0000000
Binary files a/models/__pycache__/qwen3.cpython-312.pyc and /dev/null differ
diff --git a/models/__pycache__/weight_loader.cpython-312.pyc b/models/__pycache__/weight_loader.cpython-312.pyc
deleted file mode 100644
index d7ffca3..0000000
Binary files a/models/__pycache__/weight_loader.cpython-312.pyc and /dev/null differ
diff --git a/models/attention.py b/models/attention.py
index c26a92e..e3767fa 100644
--- a/models/attention.py
+++ b/models/attention.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import math
+from models.base import get_linear_layer
 
 def rotate_half(x: torch.Tensor) -> torch.Tensor:
     x1, x2 = x.chunk(2, dim=-1)
@@ -19,10 +20,10 @@ def __init__(self, config, rotary_emb):
         self.head_dim = config.head_dim
         self.hidden_size = config.hidden_size
 
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.q_proj = get_linear_layer(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.k_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.v_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
+        self.o_proj = get_linear_layer(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias, quantize=config.quantize)
         self.rotary_emb = rotary_emb
 
     def core_attention(self, q, k, v, q_len, kv_len):
diff --git a/models/base.py b/models/base.py
index 7e20ead..6481346 100644
--- a/models/base.py
+++ b/models/base.py
@@ -3,6 +3,19 @@
 import torch
 import torch.nn as nn
 
+
+def get_linear_layer(in_features: int, out_features: int, bias: bool, quantize: bool = False):
+    """Return ``bnb.nn.Linear4bit`` (NF4, bf16 compute) when ``quantize`` is set, else ``nn.Linear``."""
+    if not quantize:
+        return nn.Linear(in_features, out_features, bias=bias)
+
+    # Imported lazily so bitsandbytes is only required when quantization is requested.
+    import bitsandbytes as bnb
+
+    quant_kwargs = {"compute_dtype": torch.bfloat16, "quant_type": "nf4"}
+    return bnb.nn.Linear4bit(in_features, out_features, bias=bias, **quant_kwargs)
+
+
 class CausalLM(ABC, nn.Module):
     """Every model must implement this interface. The engine never looks inside."""
 
diff --git a/models/llama.py b/models/llama.py
index 84150f1..f8f1cf5 100644
--- a/models/llama.py
+++ b/models/llama.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from dataclasses import dataclass
-from models.base import CausalLM
+from models.base import CausalLM, get_linear_layer
 from models.attention import Attention, FlashAttention
 
 
@@ -24,6 +24,7 @@ class LlamaConfig:
     head_dim: int | None = None
     dtype: torch.dtype = torch.bfloat16
     device: str = "cuda"
+    quantize: bool = False
     
     def __post_init__(self):
         if self.head_dim is None:
@@ -59,9 +60,9 @@ def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
 class MLP(nn.Module):
     def __init__(self, config: LlamaConfig):
         super().__init__()
-        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.gate_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize)
+        self.up_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize)
+        self.down_proj = get_linear_layer(config.intermediate_size, config.hidden_size, bias=False, quantize=config.quantize)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
diff --git a/models/weight_loader.py b/models/weight_loader.py
index 9bae224..3da8023 100644
--- a/models/weight_loader.py
+++ b/models/weight_loader.py
@@ -1,10 +1,17 @@
 import json
 import os
+import gc
 import torch
+import torch.nn as nn
 from safetensors.torch import load_file
 from models.llama import LlamaConfig, LlamaForCausalLM
 from models.qwen3 import QwenForCausalLM
+import logging
 from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+
+# Initialize logger
+logger = logging.getLogger(__name__)
 
 # REGISTRY
 
@@ -16,8 +23,123 @@
         "qwen3": QwenForCausalLM,
     }
 
-def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.bfloat16):
-    print(f"Loading model {model_id} to {device} with dtype {dtype}")
+
+def _remap_key(k: str) -> str:
+    """Map a HuggingFace weight name to ours: drop a leading ``model.`` and rename attn/norm parts."""
+    return (
+        k.removeprefix("model.")
+        .replace("self_attn.", "attn.")
+        .replace("input_layernorm", "input_norm")
+        .replace("post_attention_layernorm", "post_norm")
+    )
+
+
+def _resolve_parameter(model: nn.Module, key: str):
+    """Walk the dot-separated ``key`` down from ``model`` and return ``(owner, leaf_name)``."""
+    *owner_path, leaf_name = key.split(".")
+    owner = model
+    for component in owner_path:
+        owner = getattr(owner, component)
+    return owner, leaf_name
+
+
+def _find_shard_paths(model_id: str, local_only: bool) -> list[str]:
+    """Locate the safetensors weight file(s) for ``model_id``.
+
+    Each file is looked up in the local cache first; on a cache miss it is
+    downloaded from the Hub unless ``local_only`` is set.  Returns a
+    one-element list for a single-file checkpoint, else one path per shard.
+    """
+
+    def _fetch(name: str) -> str:
+        """Return the cached path of ``name``, downloading on a miss when allowed."""
+        try:
+            return hf_hub_download(repo_id=model_id, filename=name, local_files_only=True)
+        except EntryNotFoundError:
+            if local_only:
+                raise
+            return hf_hub_download(repo_id=model_id, filename=name, local_files_only=False)
+
+    # 1. Single-file checkpoint
+    try:
+        return [_fetch("model.safetensors")]
+    except EntryNotFoundError:
+        pass  # No single weights file was found, so fall through to the sharded layout
+    except Exception as e:
+        logger.error(f"Error checking for single-file model: {e}")
+        raise
+
+    # 2. Sharded checkpoint: read the index, then resolve every shard it names
+    index_path = _fetch("model.safetensors.index.json")
+    with open(index_path, "r") as index_file:
+        # weight_map values repeat (one entry per tensor), so de-duplicate the file names.
+        shard_names = sorted(set(json.load(index_file)["weight_map"].values()))
+    return [_fetch(shard_name) for shard_name in shard_names]
+
+
+def _load_standard(model, shard_paths, device, dtype):
+    """Fast path: gather every shard on ``device`` and assign all weights in a single call.
+
+    Returns the ``(missing, unexpected)`` key lists reported by ``load_state_dict``.
+    """
+    raw_weights = {}
+    for shard_path in shard_paths:
+        raw_weights.update(load_file(shard_path, device=device))
+    # Rename HF keys to local names and cast; drop the raw dict before assignment.
+    renamed = {_remap_key(name): tensor.to(dtype) for name, tensor in raw_weights.items()}
+    del raw_weights
+    missing, unexpected = model.load_state_dict(renamed, strict=False, assign=True)
+    return missing, unexpected
+
+
+def _load_quantized(model, shard_paths, device, dtype):
+    """Quantized path: load each shard on CPU and install its tensors one at a time (Params4bit or plain Parameter)."""
+    from bitsandbytes.nn import Params4bit
+
+    for path in shard_paths:
+        shard = load_file(path, device="cpu")  # Always load to CPU first; tensors reach `device` one by one below
+
+        keys = list(shard.keys())
+        for k in keys:
+            v = shard.pop(k)  # Pop so the shard dict stops referencing tensors already handled
+            new_k = _remap_key(k)
+
+            try:
+                target, attr_name = _resolve_parameter(model, new_k)
+                param = getattr(target, attr_name)
+            except AttributeError:
+                del v
+                continue  # Checkpoint keys with no matching attribute on the model are skipped silently
+
+            v_typed = v.to(dtype=dtype)
+            del v  # Drop the original name; only v_typed is used from here on
+
+            if hasattr(param, "quant_type"):
+                # Existing parameter carries a quant_type (i.e. it belongs to a Linear4bit) — rebuild it as Params4bit
+                new_param = Params4bit(
+                    v_typed,
+                    requires_grad=False,
+                    quant_type=getattr(param, "quant_type", "nf4"),
+                )
+                del v_typed  # Drop the local name before moving to `device` (new_param presumably still holds the data)
+                new_param = new_param.to(device)  # NOTE(review): presumably bitsandbytes quantizes during this move — confirm
+                setattr(target, attr_name, new_param)
+            else:
+                # Any other parameter: cast copy is moved to `device` and registered frozen
+                target.register_parameter(
+                    attr_name,
+                    nn.Parameter(v_typed.to(device), requires_grad=False),
+                )
+                del v_typed
+
+        del shard
+        gc.collect()
+        if device != "cpu" and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+
+def load_hf_model(model_id: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16, quantize: bool = False):
+    print(f"Loading model {model_id} to {device} with dtype {dtype} (quantize={quantize})")
 
     local_only = os.environ.get("HF_HUB_OFFLINE") == "1"
     config_path = hf_hub_download(repo_id=model_id, filename="config.json", local_files_only=local_only)
@@ -38,6 +160,7 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b
         attention_bias=hf.get("attention_bias", False),
         tie_word_embeddings=hf.get("tie_word_embeddings", False),
         head_dim=hf.get("head_dim"),
+        quantize=quantize,
     )
 
     print(hf['architectures'][0])
@@ -49,45 +172,25 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b
 
     model_class = MODEL_REGISTRY[model_type]
 
-    # Initialize model on meta device (no actual memory allocation)
+    # 1. Initialize model on meta device (no actual memory allocation)
     with torch.device("meta"):
         model = model_class(config)
 
-    # Load weights directly to target device/dtype to avoid CPU copies
-    try:
-        weights_path = hf_hub_download(repo_id=model_id, filename="model.safetensors", local_files_only=local_only)
-        state_dict = load_file(weights_path, device=device)
-    except Exception:
-        index_path = hf_hub_download(repo_id=model_id, filename="model.safetensors.index.json", local_files_only=local_only)
-        with open(index_path, "r") as f:
-            index = json.load(f)
-        state_dict = {}
-        for shard in set(index["weight_map"].values()):
-            state_dict.update(load_file(hf_hub_download(repo_id=model_id, filename=shard, local_files_only=local_only), device=device))
-        
-    # Map HF names -> our names
-    mapped = {}
-    for k,v in state_dict.items():
-        new_k = k
-        if new_k.startswith("model."):
-            new_k = new_k[6:]
-        new_k = new_k.replace("self_attn.", "attn.")
-        new_k = new_k.replace("input_layernorm", "input_norm")
-        new_k = new_k.replace("post_attention_layernorm", "post_norm")
-        mapped[new_k] = v.to(dtype)
+    # 2. Find shard files
+    shard_paths = _find_shard_paths(model_id, local_only)
 
-    del state_dict
-
-    # assign=True replaces meta tensors with real ones (no double allocation)
-    # Note: Tensors in 'mapped' are already on the target device and dtype.
-    missing, unexpected = model.load_state_dict(mapped, strict=False, assign=True)
+    # 3. Load weights — dual path strategy
+    if quantize:
+        _load_quantized(model, shard_paths, device, dtype)
+    else:
+        _load_standard(model, shard_paths, device, dtype)
 
-    # 1. Re-tie weights if they were tied in config.
-    # Using assign=True breaks existing tying because it replaces Parameter objects.
+    # 4. Re-tie weights if they were tied in config
+    # assign=True / per-param loading breaks existing tying because it replaces Parameter objects
     if config.tie_word_embeddings:
         model.lm_head.weight = model.embed_tokens.weight
 
-    # 2. Re-materialize RotaryEmbedding buffers on the target device.
+    # 5. Re-materialize RotaryEmbedding buffers on the target device.
     # These are computed buffers (not saved in checkpoints) that remain as
     # meta tensors after meta-device init + assign=True loading.
     from models.llama import RotaryEmbedding
@@ -101,30 +204,30 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b
             module.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
             module.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
-    # 3. Ensure all remaining tensors (e.g. missing params, other buffers) are on the correct device/dtype
-    # We use to_empty() for any remaining meta tensors, then to() for the rest.
+    # 6. Check for missing parameters and buffers before materializing them
+    missing_params = [name for name, param in model.named_parameters() if param.is_meta]
+    missing_buffers = [name for name, buf in model.named_buffers() if buf.is_meta]
+
+    if missing_params or missing_buffers:
+        raise ValueError(f"Missing weights/buffers in checkpoint! params={missing_params}, buffers={missing_buffers}")
+
+    # 7. Ensure all remaining allowed meta tensors are materialized deterministically
     for param in model.parameters():
         if param.is_meta:
-            param.data = torch.empty_like(param, device=device)
+            param.data = torch.zeros_like(param, device=device)
     for buffer in model.buffers():
         if buffer.is_meta:
-            buffer.data = torch.empty_like(buffer, device=device)
-            
-    model.to(device, dtype=dtype)
+            buffer.data = torch.zeros_like(buffer, device=device)
 
-    if missing:
-        # Filter out expected missing buffers (RoPE caches computed at runtime)
-        real_missing = [k for k in missing if "rotary_emb" not in k]
-        if real_missing:
-            print(f"Missing: {real_missing}")
-    if unexpected:
-        print(f"Unexpected: {unexpected}")
-
-    del mapped
+    # 8. Move model to target device/dtype
+    # Note: for quantized models, bnb parameters handle their own dtype,
+    # model.to() will skip them automatically
+    model.to(device, dtype=dtype)
 
     config.device = device
     model.eval()
-    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     return model, config
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 834cd67..671fb23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,11 +5,13 @@ description = "A fast and efficient LLM inference engine."
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "bitsandbytes>=0.49.2",
     "huggingface-hub>=1.11.0",
     "packaging>=26.1",
     "protobuf>=7.34.1",
     "pytest>=9.0.3",
     "safetensors>=0.7.0",
+    "sentencepiece>=0.2.1",
     "torch>=2.11.0",
     "transformers>=5.5.4",
 ]
diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc
deleted file mode 100644
index e95fef7..0000000
Binary files a/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc
deleted file mode 100644
index 8d26602..0000000
Binary files a/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc
deleted file mode 100644
index eff5df4..0000000
Binary files a/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc and /dev/null differ
diff --git a/uv.lock b/uv.lock
index 99219f7..1deac9c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -24,6 +24,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" },
 ]
 
+[[package]]
+name = "bitsandbytes"
+version = "0.49.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "torch" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d8/7d/f1fe0992334b18cd8494f89aeec1dcc674635584fcd9f115784fea3a1d05/bitsandbytes-0.49.2-py3-none-macosx_14_0_arm64.whl", hash = "sha256:87be5975edeac5396d699ecbc39dfc47cf2c026daaf2d5852a94368611a6823f", size = 131940, upload-time = "2026-02-16T21:26:04.572Z" },
+    { url = "https://files.pythonhosted.org/packages/29/71/acff7af06c818664aa87ff73e17a52c7788ad746b72aea09d3cb8e424348/bitsandbytes-0.49.2-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:2fc0830c5f7169be36e60e11f2be067c8f812dfcb829801a8703735842450750", size = 31442815, upload-time = "2026-02-16T21:26:06.783Z" },
+    { url = "https://files.pythonhosted.org/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:54b771f06e1a3c73af5c7f16ccf0fc23a846052813d4b008d10cb6e017dd1c8c", size = 60651714, upload-time = "2026-02-16T21:26:11.579Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/d4/501655842ad6771fb077f576d78cbedb5445d15b1c3c91343ed58ca46f0e/bitsandbytes-0.49.2-py3-none-win_amd64.whl", hash = "sha256:2e0ddd09cd778155388023cbe81f00afbb7c000c214caef3ce83386e7144df7d", size = 55372289, upload-time = "2026-02-16T21:26:16.267Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2026.2.25"
@@ -799,6 +815,54 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" },
 ]
 
+[[package]]
+name = "sentencepiece"
+version = "0.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4a/be/32ce495aa1d0e0c323dcb1ba87096037358edee539cac5baf8755a6bd396/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57cae326c8727de58c85977b175af132a7138d84c764635d7e71bbee7e774133", size = 1943152, upload-time = "2025-08-12T06:59:40.048Z" },
+    { url = "https://files.pythonhosted.org/packages/88/7e/ff23008899a58678e98c6ff592bf4d368eee5a71af96d0df6b38a039dd4f/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:56dd39a3c4d6493db3cdca7e8cc68c6b633f0d4195495cbadfcf5af8a22d05a6", size = 1325651, upload-time = "2025-08-12T06:59:41.536Z" },
+    { url = "https://files.pythonhosted.org/packages/19/84/42eb3ce4796777a1b5d3699dfd4dca85113e68b637f194a6c8d786f16a04/sentencepiece-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9381351182ff9888cc80e41c632e7e274b106f450de33d67a9e8f6043da6f76", size = 1253645, upload-time = "2025-08-12T06:59:42.903Z" },
+    { url = "https://files.pythonhosted.org/packages/89/fa/d3d5ebcba3cb9e6d3775a096251860c41a6bc53a1b9461151df83fe93255/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99f955df238021bf11f0fc37cdb54fd5e5b5f7fd30ecc3d93fb48b6815437167", size = 1316273, upload-time = "2025-08-12T06:59:44.476Z" },
+    { url = "https://files.pythonhosted.org/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cdfecef430d985f1c2bcbfff3defd1d95dae876fbd0173376012d2d7d24044b", size = 1387881, upload-time = "2025-08-12T06:59:46.09Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/b8/903e5ccb77b4ef140605d5d71b4f9e0ad95d456d6184688073ed11712809/sentencepiece-0.2.1-cp312-cp312-win32.whl", hash = "sha256:a483fd29a34c3e34c39ac5556b0a90942bec253d260235729e50976f5dba1068", size = 999540, upload-time = "2025-08-12T06:59:48.023Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/81/92df5673c067148c2545b1bfe49adfd775bcc3a169a047f5a0e6575ddaca/sentencepiece-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4cdc7c36234fda305e85c32949c5211faaf8dd886096c7cea289ddc12a2d02de", size = 1054671, upload-time = "2025-08-12T06:59:49.895Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/02/c5e3bc518655d714622bec87d83db9cdba1cd0619a4a04e2109751c4f47f/sentencepiece-0.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:daeb5e9e9fcad012324807856113708614d534f596d5008638eb9b40112cd9e4", size = 1033923, upload-time = "2025-08-12T06:59:51.952Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/4a/85fbe1706d4d04a7e826b53f327c4b80f849cf1c7b7c5e31a20a97d8f28b/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dcd8161eee7b41aae57ded06272905dbd680a0a04b91edd0f64790c796b2f706", size = 1943150, upload-time = "2025-08-12T06:59:53.588Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/83/4cfb393e287509fc2155480b9d184706ef8d9fa8cbf5505d02a5792bf220/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c6c8f42949f419ff8c7e9960dbadcfbc982d7b5efc2f6748210d3dd53a7de062", size = 1325651, upload-time = "2025-08-12T06:59:55.073Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/de/5a007fb53b1ab0aafc69d11a5a3dd72a289d5a3e78dcf2c3a3d9b14ffe93/sentencepiece-0.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:097f3394e99456e9e4efba1737c3749d7e23563dd1588ce71a3d007f25475fff", size = 1253641, upload-time = "2025-08-12T06:59:56.562Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/d2/f552be5928105588f4f4d66ee37dd4c61460d8097e62d0e2e0eec41bc61d/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b670879c370d350557edabadbad1f6561a9e6968126e6debca4029e5547820", size = 1316271, upload-time = "2025-08-12T06:59:58.109Z" },
+    { url = "https://files.pythonhosted.org/packages/96/df/0cfe748ace5485be740fed9476dee7877f109da32ed0d280312c94ec259f/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7f0fd2f2693309e6628aeeb2e2faf6edd221134dfccac3308ca0de01f8dab47", size = 1387882, upload-time = "2025-08-12T07:00:00.701Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/dd/f7774d42a881ced8e1739f393ab1e82ece39fc9abd4779e28050c2e975b5/sentencepiece-0.2.1-cp313-cp313-win32.whl", hash = "sha256:92b3816aa2339355fda2c8c4e021a5de92180b00aaccaf5e2808972e77a4b22f", size = 999541, upload-time = "2025-08-12T07:00:02.709Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/e9/932b9eae6fd7019548321eee1ab8d5e3b3d1294df9d9a0c9ac517c7b636d/sentencepiece-0.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:10ed3dab2044c47f7a2e7b4969b0c430420cdd45735d78c8f853191fa0e3148b", size = 1054669, upload-time = "2025-08-12T07:00:04.915Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/3a/76488a00ea7d6931689cda28726a1447d66bf1a4837943489314593d5596/sentencepiece-0.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac650534e2251083c5f75dde4ff28896ce7c8904133dc8fef42780f4d5588fcd", size = 1033922, upload-time = "2025-08-12T07:00:06.496Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/b6/08fe2ce819e02ccb0296f4843e3f195764ce9829cbda61b7513f29b95718/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:8dd4b477a7b069648d19363aad0cab9bad2f4e83b2d179be668efa672500dc94", size = 1946052, upload-time = "2025-08-12T07:00:08.136Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/d9/1ea0e740591ff4c6fc2b6eb1d7510d02f3fb885093f19b2f3abd1363b402/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0c0f672da370cc490e4c59d89e12289778310a0e71d176c541e4834759e1ae07", size = 1327408, upload-time = "2025-08-12T07:00:09.572Z" },
+    { url = "https://files.pythonhosted.org/packages/99/7e/1fb26e8a21613f6200e1ab88824d5d203714162cf2883248b517deb500b7/sentencepiece-0.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ad8493bea8432dae8d6830365352350f3b4144415a1d09c4c8cb8d30cf3b6c3c", size = 1254857, upload-time = "2025-08-12T07:00:11.021Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/85/c72fd1f3c7a6010544d6ae07f8ddb38b5e2a7e33bd4318f87266c0bbafbf/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b81a24733726e3678d2db63619acc5a8dccd074f7aa7a54ecd5ca33ca6d2d596", size = 1315722, upload-time = "2025-08-12T07:00:12.989Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/e8/661e5bd82a8aa641fd6c1020bd0e890ef73230a2b7215ddf9c8cd8e941c2/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0a81799d0a68d618e89063fb423c3001a034c893069135ffe51fee439ae474d6", size = 1387452, upload-time = "2025-08-12T07:00:15.088Z" },
+    { url = "https://files.pythonhosted.org/packages/99/5e/ae66c361023a470afcbc1fbb8da722c72ea678a2fcd9a18f1a12598c7501/sentencepiece-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:89a3ea015517c42c0341d0d962f3e6aaf2cf10d71b1932d475c44ba48d00aa2b", size = 1002501, upload-time = "2025-08-12T07:00:16.966Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/03/d332828c4ff764e16c1b56c2c8f9a33488bbe796b53fb6b9c4205ddbf167/sentencepiece-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:33f068c9382dc2e7c228eedfd8163b52baa86bb92f50d0488bf2b7da7032e484", size = 1057555, upload-time = "2025-08-12T07:00:18.573Z" },
+    { url = "https://files.pythonhosted.org/packages/88/14/5aee0bf0864df9bd82bd59e7711362908e4935e3f9cdc1f57246b5d5c9b9/sentencepiece-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:b3616ad246f360e52c85781e47682d31abfb6554c779e42b65333d4b5f44ecc0", size = 1036042, upload-time = "2025-08-12T07:00:20.209Z" },
+    { url = "https://files.pythonhosted.org/packages/24/9c/89eb8b2052f720a612478baf11c8227dcf1dc28cd4ea4c0c19506b5af2a2/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5d0350b686c320068702116276cfb26c066dc7e65cfef173980b11bb4d606719", size = 1943147, upload-time = "2025-08-12T07:00:21.809Z" },
+    { url = "https://files.pythonhosted.org/packages/82/0b/a1432bc87f97c2ace36386ca23e8bd3b91fb40581b5e6148d24b24186419/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c7f54a31cde6fa5cb030370566f68152a742f433f8d2be458463d06c208aef33", size = 1325624, upload-time = "2025-08-12T07:00:23.289Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/99/bbe054ebb5a5039457c590e0a4156ed073fb0fe9ce4f7523404dd5b37463/sentencepiece-0.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c83b85ab2d6576607f31df77ff86f28182be4a8de6d175d2c33ca609925f5da1", size = 1253670, upload-time = "2025-08-12T07:00:24.69Z" },
+    { url = "https://files.pythonhosted.org/packages/19/ad/d5c7075f701bd97971d7c2ac2904f227566f51ef0838dfbdfdccb58cd212/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1855f57db07b51fb51ed6c9c452f570624d2b169b36f0f79ef71a6e6c618cd8b", size = 1316247, upload-time = "2025-08-12T07:00:26.435Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/03/35fbe5f3d9a7435eebd0b473e09584bd3cc354ce118b960445b060d33781/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01e6912125cb45d3792f530a4d38f8e21bf884d6b4d4ade1b2de5cf7a8d2a52b", size = 1387894, upload-time = "2025-08-12T07:00:28.339Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/aa/956ef729aafb6c8f9c443104c9636489093bb5c61d6b90fc27aa1a865574/sentencepiece-0.2.1-cp314-cp314-win32.whl", hash = "sha256:c415c9de1447e0a74ae3fdb2e52f967cb544113a3a5ce3a194df185cbc1f962f", size = 1096698, upload-time = "2025-08-12T07:00:29.764Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/cb/fe400d8836952cc535c81a0ce47dc6875160e5fedb71d2d9ff0e9894c2a6/sentencepiece-0.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:881b2e44b14fc19feade3cbed314be37de639fc415375cefaa5bc81a4be137fd", size = 1155115, upload-time = "2025-08-12T07:00:32.865Z" },
+    { url = "https://files.pythonhosted.org/packages/32/89/047921cf70f36c7b6b6390876b2399b3633ab73b8d0cb857e5a964238941/sentencepiece-0.2.1-cp314-cp314-win_arm64.whl", hash = "sha256:2005242a16d2dc3ac5fe18aa7667549134d37854823df4c4db244752453b78a8", size = 1133890, upload-time = "2025-08-12T07:00:34.763Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/11/5b414b9fae6255b5fb1e22e2ed3dc3a72d3a694e5703910e640ac78346bb/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:a19adcec27c524cb7069a1c741060add95f942d1cbf7ad0d104dffa0a7d28a2b", size = 1946081, upload-time = "2025-08-12T07:00:36.97Z" },
+    { url = "https://files.pythonhosted.org/packages/77/eb/7a5682bb25824db8545f8e5662e7f3e32d72a508fdce086029d89695106b/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e37e4b4c4a11662b5db521def4e44d4d30ae69a1743241412a93ae40fdcab4bb", size = 1327406, upload-time = "2025-08-12T07:00:38.669Z" },
+    { url = "https://files.pythonhosted.org/packages/03/b0/811dae8fb9f2784e138785d481469788f2e0d0c109c5737372454415f55f/sentencepiece-0.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:477c81505db072b3ab627e7eab972ea1025331bd3a92bacbf798df2b75ea86ec", size = 1254846, upload-time = "2025-08-12T07:00:40.611Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/23/195b2e7ec85ebb6a547969f60b723c7aca5a75800ece6cc3f41da872d14e/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:010f025a544ef770bb395091d57cb94deb9652d8972e0d09f71d85d5a0816c8c", size = 1315721, upload-time = "2025-08-12T07:00:42.914Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/aa/553dbe4178b5f23eb28e59393dddd64186178b56b81d9b8d5c3ff1c28395/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:733e59ff1794d26db706cd41fc2d7ca5f6c64a820709cb801dc0ea31780d64ab", size = 1387458, upload-time = "2025-08-12T07:00:44.56Z" },
+    { url = "https://files.pythonhosted.org/packages/66/7c/08ff0012507297a4dd74a5420fdc0eb9e3e80f4e88cab1538d7f28db303d/sentencepiece-0.2.1-cp314-cp314t-win32.whl", hash = "sha256:d3233770f78e637dc8b1fda2cd7c3b99ec77e7505041934188a4e7fe751de3b0", size = 1099765, upload-time = "2025-08-12T07:00:46.058Z" },
+    { url = "https://files.pythonhosted.org/packages/91/d5/2a69e1ce15881beb9ddfc7e3f998322f5cedcd5e4d244cb74dade9441663/sentencepiece-0.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5e4366c97b68218fd30ea72d70c525e6e78a6c0a88650f57ac4c43c63b234a9d", size = 1157807, upload-time = "2025-08-12T07:00:47.673Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/16/54f611fcfc2d1c46cbe3ec4169780b2cfa7cf63708ef2b71611136db7513/sentencepiece-0.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:105e36e75cbac1292642045458e8da677b2342dcd33df503e640f0b457cb6751", size = 1136264, upload-time = "2025-08-12T07:00:49.485Z" },
+]
+
 [[package]]
 name = "setuptools"
 version = "81.0.0"
@@ -976,22 +1040,26 @@ name = "vllmini"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "bitsandbytes" },
     { name = "huggingface-hub" },
     { name = "packaging" },
     { name = "protobuf" },
     { name = "pytest" },
     { name = "safetensors" },
+    { name = "sentencepiece" },
     { name = "torch" },
     { name = "transformers" },
 ]
 
 [package.metadata]
 requires-dist = [
+    { name = "bitsandbytes", specifier = ">=0.49.2" },
     { name = "huggingface-hub", specifier = ">=1.11.0" },
     { name = "packaging", specifier = ">=26.1" },
     { name = "protobuf", specifier = ">=7.34.1" },
     { name = "pytest", specifier = ">=9.0.3" },
     { name = "safetensors", specifier = ">=0.7.0" },
+    { name = "sentencepiece", specifier = ">=0.2.1" },
     { name = "torch", specifier = ">=2.11.0" },
     { name = "transformers", specifier = ">=5.5.4" },
 ]