diff --git a/.gitignore b/.gitignore index 36b9821..16c8f68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ future_plans/ -__pycache__/ -*.pyc \ No newline at end of file +**/__pycache__/ +*.pyc + +docs/quantisation.md \ No newline at end of file diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc deleted file mode 100644 index 0d03894..0000000 Binary files a/__pycache__/main.cpython-312.pyc and /dev/null differ diff --git a/engine/__pycache__/__init__.cpython-312.pyc b/engine/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 61e8cf1..0000000 Binary files a/engine/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/engine/__pycache__/generator.cpython-312.pyc b/engine/__pycache__/generator.cpython-312.pyc deleted file mode 100644 index bee5af9..0000000 Binary files a/engine/__pycache__/generator.cpython-312.pyc and /dev/null differ diff --git a/engine/__pycache__/sampler.cpython-312.pyc b/engine/__pycache__/sampler.cpython-312.pyc deleted file mode 100644 index 947a252..0000000 Binary files a/engine/__pycache__/sampler.cpython-312.pyc and /dev/null differ diff --git a/engine/generator.py b/engine/generator.py index 8dab21f..b3c477f 100644 --- a/engine/generator.py +++ b/engine/generator.py @@ -12,7 +12,9 @@ def __init__(self, model, tokenizer, sampler: Sampler | None = None): @torch.inference_mode() def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams | None = None): input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.config.device) + prompt_len = input_ids.shape[1] past_key_values = None + prev_text = "" for _ in range(max_new_tokens): if past_key_values is None: @@ -25,5 +27,11 @@ def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams if next_token.item() == self.tokenizer.eos_token_id: break - - yield self.tokenizer.decode(next_token[0], skip_special_tokens=True) \ No newline at end of file + + # Decode all generated tokens so far and yield only the new text. + # This correctly handles SentencePiece space prefixes and multi-byte chars. + full_text = self.tokenizer.decode(input_ids[0, prompt_len:], skip_special_tokens=True) + new_text = full_text[len(prev_text):] + prev_text = full_text + if new_text: + yield new_text \ No newline at end of file diff --git a/main.py b/main.py index e3edf97..4049548 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,9 @@ """CLI entry point.""" import os +import warnings +warnings.filterwarnings("ignore", message=".*_check_is_size.*", category=FutureWarning) import argparse import torch -from huggingface_hub import try_to_load_from_cache from transformers import AutoTokenizer from models.weight_loader import load_hf_model @@ -15,15 +16,6 @@ MODEL_ID = "Qwen/Qwen3-0.6B" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" -# Resolve local cache path if model is already downloaded -# Passing a local dir path to AutoTokenizer prevents ALL network calls -_cached = try_to_load_from_cache(MODEL_ID, "config.json") -LOCAL_MODEL_PATH = os.path.dirname(_cached) if isinstance(_cached, str) else None - -if LOCAL_MODEL_PATH: - os.environ["HF_HUB_OFFLINE"] = "1" - os.environ["TRANSFORMERS_OFFLINE"] = "1" - def parse_args(): parser = argparse.ArgumentParser(description="vLLMini Chat") @@ -33,6 +25,7 @@ def parse_args(): parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") parser.add_argument("--top-p", type=float, default=0.9, help="Nucleus sampling threshold") parser.add_argument("--max-tokens", type=int, default=2048, help="Maximum new tokens to generate") + parser.add_argument("--quantize", "-q", action="store_true", default=False, help="Enable 4-bit NF4 quantization (requires bitsandbytes)") return parser.parse_args() @@ -45,15 +38,19 @@ def strip_thinking(output: str) -> str: def main(): args = parse_args() + + # Don't force offline mode for model loading — the weight_loader + # handles local-first-then-download fallback on its own. + os.environ.pop("HF_HUB_OFFLINE", None) + os.environ.pop("TRANSFORMERS_OFFLINE", None) + + model, config = load_hf_model(args.model_id, device=args.device, quantize=args.quantize) + + tokenizer = AutoTokenizer.from_pretrained(args.model_id) + # chat = [{"role": "user", "content": "Write a short story about a robot."}] + # prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) - model, config = load_hf_model(args.model_id, device=args.device) - - # Use local model path if available (from user's caching logic) - tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH or args.model_id) - chat = [{"role": "user", "content": "Write a short story about a robot."}] - prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) - - # prompt = "Write a very long story about a robot." + prompt = "Write a very long story about a robot." params = SamplingParams(temperature=args.temperature, top_p=args.top_p) sampler = Sampler() @@ -117,6 +114,11 @@ def main(): if remainder: print(remainder, end="", flush=True) parts.append(remainder) + elif not indicator_shown and len(buffer) > 20: + # Model doesn't use tags — flush buffer and stream normally + thinking_done = True + print(buffer, end="", flush=True) + parts.append(buffer) # Otherwise keep accumulating silently else: # Either HIDE_THINKING is False, or we're past diff --git a/models/__pycache__/__init__.cpython-312.pyc b/models/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index f64a187..0000000 Binary files a/models/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/models/__pycache__/attention.cpython-312.pyc b/models/__pycache__/attention.cpython-312.pyc deleted file mode 100644 index 7c0880f..0000000 Binary files a/models/__pycache__/attention.cpython-312.pyc and /dev/null differ diff --git a/models/__pycache__/base.cpython-312.pyc b/models/__pycache__/base.cpython-312.pyc deleted file mode 100644 index 9c4f2cf..0000000 Binary files a/models/__pycache__/base.cpython-312.pyc and /dev/null differ diff --git a/models/__pycache__/llama.cpython-312.pyc b/models/__pycache__/llama.cpython-312.pyc deleted file mode 100644 index f7662be..0000000 Binary files a/models/__pycache__/llama.cpython-312.pyc and /dev/null differ diff --git a/models/__pycache__/qwen3.cpython-312.pyc b/models/__pycache__/qwen3.cpython-312.pyc deleted file mode 100644 index 69214cd..0000000 Binary files a/models/__pycache__/qwen3.cpython-312.pyc and /dev/null differ diff --git a/models/__pycache__/weight_loader.cpython-312.pyc b/models/__pycache__/weight_loader.cpython-312.pyc deleted file mode 100644 index d7ffca3..0000000 Binary files a/models/__pycache__/weight_loader.cpython-312.pyc and /dev/null differ diff --git a/models/attention.py b/models/attention.py index c26a92e..e3767fa 100644 --- a/models/attention.py +++ b/models/attention.py @@ -2,6 +2,7 @@ import torch.nn as nn import torch.nn.functional as F import math +from models.base import get_linear_layer def rotate_half(x: torch.Tensor) -> torch.Tensor: x1, x2 = x.chunk(2, dim=-1) @@ -19,10 +20,10 @@ def __init__(self, config, rotary_emb): self.head_dim = config.head_dim self.hidden_size = config.hidden_size - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + self.q_proj = get_linear_layer(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize) + self.k_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize) + self.v_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize) + self.o_proj = get_linear_layer(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias, quantize=config.quantize) self.rotary_emb = rotary_emb def core_attention(self, q, k, v, q_len, kv_len): diff --git a/models/base.py b/models/base.py index 7e20ead..6481346 100644 --- a/models/base.py +++ b/models/base.py @@ -3,6 +3,19 @@ import torch import torch.nn as nn + +def get_linear_layer(in_features: int, out_features: int, bias: bool, quantize: bool = False): + """Factory that returns nn.Linear or bnb.nn.Linear4bit depending on quantize flag.""" + if quantize: + import bitsandbytes as bnb + return bnb.nn.Linear4bit( + in_features, out_features, bias=bias, + compute_dtype=torch.bfloat16, + quant_type="nf4", + ) + return nn.Linear(in_features, out_features, bias=bias) + + class CausalLM(ABC, nn.Module): """Every model must implement this interface. The engine never looks inside.""" diff --git a/models/llama.py b/models/llama.py index 84150f1..f8f1cf5 100644 --- a/models/llama.py +++ b/models/llama.py @@ -4,7 +4,7 @@ import torch.nn as nn import torch.nn.functional as F from dataclasses import dataclass -from models.base import CausalLM +from models.base import CausalLM, get_linear_layer from models.attention import Attention, FlashAttention @@ -24,6 +24,7 @@ class LlamaConfig: head_dim: int | None = None dtype: torch.dtype = torch.bfloat16 device: str = "cuda" + quantize: bool = False def __post_init__(self): if self.head_dim is None: @@ -59,9 +60,9 @@ def forward(self, x: torch.Tensor, position_ids: torch.Tensor): class MLP(nn.Module): def __init__(self, config: LlamaConfig): super().__init__() - self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.gate_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize) + self.up_proj = get_linear_layer(config.hidden_size, config.intermediate_size, bias=False, quantize=config.quantize) + self.down_proj = get_linear_layer(config.intermediate_size, config.hidden_size, bias=False, quantize=config.quantize) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) diff --git a/models/weight_loader.py b/models/weight_loader.py index 9bae224..3da8023 100644 --- a/models/weight_loader.py +++ b/models/weight_loader.py @@ -1,10 +1,17 @@ import json import os +import gc import torch +import torch.nn as nn from safetensors.torch import load_file from models.llama import LlamaConfig, LlamaForCausalLM from models.qwen3 import QwenForCausalLM +import logging from huggingface_hub import hf_hub_download +from huggingface_hub.utils import EntryNotFoundError + +# Initialize logger +logger = logging.getLogger(__name__) # REGISTRY @@ -16,8 +23,123 @@ "qwen3": QwenForCausalLM, } -def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.bfloat16): - print(f"Loading model {model_id} to {device} with dtype {dtype}") + +def _remap_key(k: str) -> str: + """Map HuggingFace weight names -> our names.""" + if k.startswith("model."): + k = k[6:] + k = k.replace("self_attn.", "attn.") + k = k.replace("input_layernorm", "input_norm") + k = k.replace("post_attention_layernorm", "post_norm") + return k + + +def _resolve_parameter(model: nn.Module, key: str): + """Walk dot-separated key to find (parent_module, attr_name).""" + parts = key.split(".") + target = model + for part in parts[:-1]: + target = getattr(target, part) + return target, parts[-1] + + +def _find_shard_paths(model_id: str, local_only: bool) -> list[str]: + """Return list of safetensors shard paths (single file or multi-shard). + + Tries local cache first. If the file isn't cached *and* ``local_only`` + is ``False``, transparently falls back to downloading from the Hub. + """ + + def _download(filename: str, *, must_be_local: bool) -> str: + """Try local first, then online if allowed.""" + try: + return hf_hub_download(repo_id=model_id, filename=filename, local_files_only=True) + except EntryNotFoundError: + if must_be_local: + raise + return hf_hub_download(repo_id=model_id, filename=filename, local_files_only=False) + + # 1. Try single-file model + try: + return [_download("model.safetensors", must_be_local=local_only)] + except EntryNotFoundError: + # Expected if model is sharded + pass + except Exception as e: + logger.error(f"Error checking for single-file model: {e}") + raise + + # 2. Multi-shard model — get the index first, then each shard + index_path = _download("model.safetensors.index.json", must_be_local=local_only) + with open(index_path, "r") as f: + index = json.load(f) + shard_files = sorted(set(index["weight_map"].values())) + return [_download(f, must_be_local=local_only) for f in shard_files] + + +def _load_standard(model, shard_paths, device, dtype): + """Fast path: load all weights at once via load_state_dict(assign=True).""" + state_dict = {} + for path in shard_paths: + state_dict.update(load_file(path, device=device)) + + mapped = {_remap_key(k): v.to(dtype) for k, v in state_dict.items()} + del state_dict + + missing, unexpected = model.load_state_dict(mapped, strict=False, assign=True) + del mapped + + return missing, unexpected + + +def _load_quantized(model, shard_paths, device, dtype): + """Quantized path: load shard-by-shard, quantize per-parameter via Params4bit.""" + from bitsandbytes.nn import Params4bit + + for path in shard_paths: + shard = load_file(path, device="cpu") # Always load to CPU first + + keys = list(shard.keys()) + for k in keys: + v = shard.pop(k) # Pop to free memory as we iterate + new_k = _remap_key(k) + + try: + target, attr_name = _resolve_parameter(model, new_k) + param = getattr(target, attr_name) + except AttributeError: + del v + continue # Skip unmapped keys (e.g. keys we don't use) + + v_typed = v.to(dtype=dtype) + del v # Free original tensor immediately + + if hasattr(param, "quant_type"): + # This is a Linear4bit parameter — quantize and place on device + new_param = Params4bit( + v_typed, + requires_grad=False, + quant_type=getattr(param, "quant_type", "nf4"), + ) + del v_typed # Free CPU copy before GPU allocation + new_param = new_param.to(device) + setattr(target, attr_name, new_param) + else: + # Normal parameter (embeddings, norms, lm_head, etc.) + target.register_parameter( + attr_name, + nn.Parameter(v_typed.to(device), requires_grad=False), + ) + del v_typed + + del shard + gc.collect() + if device != "cpu" and torch.cuda.is_available(): + torch.cuda.empty_cache() + + +def load_hf_model(model_id: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16, quantize: bool = False): + print(f"Loading model {model_id} to {device} with dtype {dtype} (quantize={quantize})") local_only = os.environ.get("HF_HUB_OFFLINE") == "1" config_path = hf_hub_download(repo_id=model_id, filename="config.json", local_files_only=local_only) @@ -38,6 +160,7 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b attention_bias=hf.get("attention_bias", False), tie_word_embeddings=hf.get("tie_word_embeddings", False), head_dim=hf.get("head_dim"), + quantize=quantize, ) print(hf['architectures'][0]) @@ -49,45 +172,25 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b model_class = MODEL_REGISTRY[model_type] - # Initialize model on meta device (no actual memory allocation) + # 1. Initialize model on meta device (no actual memory allocation) with torch.device("meta"): model = model_class(config) - # Load weights directly to target device/dtype to avoid CPU copies - try: - weights_path = hf_hub_download(repo_id=model_id, filename="model.safetensors", local_files_only=local_only) - state_dict = load_file(weights_path, device=device) - except Exception: - index_path = hf_hub_download(repo_id=model_id, filename="model.safetensors.index.json", local_files_only=local_only) - with open(index_path, "r") as f: - index = json.load(f) - state_dict = {} - for shard in set(index["weight_map"].values()): - state_dict.update(load_file(hf_hub_download(repo_id=model_id, filename=shard, local_files_only=local_only), device=device)) - - # Map HF names -> our names - mapped = {} - for k,v in state_dict.items(): - new_k = k - if new_k.startswith("model."): - new_k = new_k[6:] - new_k = new_k.replace("self_attn.", "attn.") - new_k = new_k.replace("input_layernorm", "input_norm") - new_k = new_k.replace("post_attention_layernorm", "post_norm") - mapped[new_k] = v.to(dtype) + # 2. Find shard files + shard_paths = _find_shard_paths(model_id, local_only) - del state_dict - - # assign=True replaces meta tensors with real ones (no double allocation) - # Note: Tensors in 'mapped' are already on the target device and dtype. - missing, unexpected = model.load_state_dict(mapped, strict=False, assign=True) + # 3. Load weights — dual path strategy + if quantize: + _load_quantized(model, shard_paths, device, dtype) + else: + _load_standard(model, shard_paths, device, dtype) - # 1. Re-tie weights if they were tied in config. - # Using assign=True breaks existing tying because it replaces Parameter objects. + # 4. Re-tie weights if they were tied in config + # assign=True / per-param loading breaks existing tying because it replaces Parameter objects if config.tie_word_embeddings: model.lm_head.weight = model.embed_tokens.weight - # 2. Re-materialize RotaryEmbedding buffers on the target device. + # 5. Re-materialize RotaryEmbedding buffers on the target device. # These are computed buffers (not saved in checkpoints) that remain as # meta tensors after meta-device init + assign=True loading. from models.llama import RotaryEmbedding @@ -101,30 +204,30 @@ def load_hf_model(model_id:str, device:str = "cuda", dtype:torch.dtype = torch.b module.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) module.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - # 3. Ensure all remaining tensors (e.g. missing params, other buffers) are on the correct device/dtype - # We use to_empty() for any remaining meta tensors, then to() for the rest. + # 6. Check for missing parameters and buffers before materializing them + missing_params = [name for name, param in model.named_parameters() if param.is_meta] + missing_buffers = [name for name, buf in model.named_buffers() if buf.is_meta] + + if missing_params or missing_buffers: + raise ValueError(f"Missing weights/buffers in checkpoint! params={missing_params}, buffers={missing_buffers}") + + # 7. Ensure all remaining allowed meta tensors are materialized deterministically for param in model.parameters(): if param.is_meta: - param.data = torch.empty_like(param, device=device) + param.data = torch.zeros_like(param, device=device) for buffer in model.buffers(): if buffer.is_meta: - buffer.data = torch.empty_like(buffer, device=device) - - model.to(device, dtype=dtype) + buffer.data = torch.zeros_like(buffer, device=device) - if missing: - # Filter out expected missing buffers (RoPE caches computed at runtime) - real_missing = [k for k in missing if "rotary_emb" not in k] - if real_missing: - print(f"Missing: {real_missing}") - if unexpected: - print(f"Unexpected: {unexpected}") - - del mapped + # 8. Move model to target device/dtype + # Note: for quantized models, bnb parameters handle their own dtype, + # model.to() will skip them automatically + model.to(device, dtype=dtype) config.device = device model.eval() - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() return model, config diff --git a/pyproject.toml b/pyproject.toml index 834cd67..671fb23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,11 +5,13 @@ description = "A fast and efficient LLM inference engine." readme = "README.md" requires-python = ">=3.12" dependencies = [ + "bitsandbytes>=0.49.2", "huggingface-hub>=1.11.0", "packaging>=26.1", "protobuf>=7.34.1", "pytest>=9.0.3", "safetensors>=0.7.0", + "sentencepiece>=0.2.1", "torch>=2.11.0", "transformers>=5.5.4", ] diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc deleted file mode 100644 index e95fef7..0000000 Binary files a/tests/__pycache__/conftest.cpython-312-pytest-9.0.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc deleted file mode 100644 index 8d26602..0000000 Binary files a/tests/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc deleted file mode 100644 index eff5df4..0000000 Binary files a/tests/__pycache__/test_sampler.cpython-312-pytest-9.0.3.pyc and /dev/null differ diff --git a/uv.lock b/uv.lock index 99219f7..1deac9c 100644 --- a/uv.lock +++ b/uv.lock @@ -24,6 +24,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, ] +[[package]] +name = "bitsandbytes" +version = "0.49.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "torch" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/7d/f1fe0992334b18cd8494f89aeec1dcc674635584fcd9f115784fea3a1d05/bitsandbytes-0.49.2-py3-none-macosx_14_0_arm64.whl", hash = "sha256:87be5975edeac5396d699ecbc39dfc47cf2c026daaf2d5852a94368611a6823f", size = 131940, upload-time = "2026-02-16T21:26:04.572Z" }, + { url = "https://files.pythonhosted.org/packages/29/71/acff7af06c818664aa87ff73e17a52c7788ad746b72aea09d3cb8e424348/bitsandbytes-0.49.2-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:2fc0830c5f7169be36e60e11f2be067c8f812dfcb829801a8703735842450750", size = 31442815, upload-time = "2026-02-16T21:26:06.783Z" }, + { url = "https://files.pythonhosted.org/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:54b771f06e1a3c73af5c7f16ccf0fc23a846052813d4b008d10cb6e017dd1c8c", size = 60651714, upload-time = "2026-02-16T21:26:11.579Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d4/501655842ad6771fb077f576d78cbedb5445d15b1c3c91343ed58ca46f0e/bitsandbytes-0.49.2-py3-none-win_amd64.whl", hash = "sha256:2e0ddd09cd778155388023cbe81f00afbb7c000c214caef3ce83386e7144df7d", size = 55372289, upload-time = "2026-02-16T21:26:16.267Z" }, +] + [[package]] name = "certifi" version = "2026.2.25" @@ -799,6 +815,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] +[[package]] +name = "sentencepiece" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/be/32ce495aa1d0e0c323dcb1ba87096037358edee539cac5baf8755a6bd396/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57cae326c8727de58c85977b175af132a7138d84c764635d7e71bbee7e774133", size = 1943152, upload-time = "2025-08-12T06:59:40.048Z" }, + { url = "https://files.pythonhosted.org/packages/88/7e/ff23008899a58678e98c6ff592bf4d368eee5a71af96d0df6b38a039dd4f/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:56dd39a3c4d6493db3cdca7e8cc68c6b633f0d4195495cbadfcf5af8a22d05a6", size = 1325651, upload-time = "2025-08-12T06:59:41.536Z" }, + { url = "https://files.pythonhosted.org/packages/19/84/42eb3ce4796777a1b5d3699dfd4dca85113e68b637f194a6c8d786f16a04/sentencepiece-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9381351182ff9888cc80e41c632e7e274b106f450de33d67a9e8f6043da6f76", size = 1253645, upload-time = "2025-08-12T06:59:42.903Z" }, + { url = "https://files.pythonhosted.org/packages/89/fa/d3d5ebcba3cb9e6d3775a096251860c41a6bc53a1b9461151df83fe93255/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99f955df238021bf11f0fc37cdb54fd5e5b5f7fd30ecc3d93fb48b6815437167", size = 1316273, upload-time = "2025-08-12T06:59:44.476Z" }, + { url = "https://files.pythonhosted.org/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cdfecef430d985f1c2bcbfff3defd1d95dae876fbd0173376012d2d7d24044b", size = 1387881, upload-time = "2025-08-12T06:59:46.09Z" }, + { url = "https://files.pythonhosted.org/packages/fd/b8/903e5ccb77b4ef140605d5d71b4f9e0ad95d456d6184688073ed11712809/sentencepiece-0.2.1-cp312-cp312-win32.whl", hash = "sha256:a483fd29a34c3e34c39ac5556b0a90942bec253d260235729e50976f5dba1068", size = 999540, upload-time = "2025-08-12T06:59:48.023Z" }, + { url = "https://files.pythonhosted.org/packages/2d/81/92df5673c067148c2545b1bfe49adfd775bcc3a169a047f5a0e6575ddaca/sentencepiece-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4cdc7c36234fda305e85c32949c5211faaf8dd886096c7cea289ddc12a2d02de", size = 1054671, upload-time = "2025-08-12T06:59:49.895Z" }, + { url = "https://files.pythonhosted.org/packages/fe/02/c5e3bc518655d714622bec87d83db9cdba1cd0619a4a04e2109751c4f47f/sentencepiece-0.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:daeb5e9e9fcad012324807856113708614d534f596d5008638eb9b40112cd9e4", size = 1033923, upload-time = "2025-08-12T06:59:51.952Z" }, + { url = "https://files.pythonhosted.org/packages/ba/4a/85fbe1706d4d04a7e826b53f327c4b80f849cf1c7b7c5e31a20a97d8f28b/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dcd8161eee7b41aae57ded06272905dbd680a0a04b91edd0f64790c796b2f706", size = 1943150, upload-time = "2025-08-12T06:59:53.588Z" }, + { url = "https://files.pythonhosted.org/packages/c2/83/4cfb393e287509fc2155480b9d184706ef8d9fa8cbf5505d02a5792bf220/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c6c8f42949f419ff8c7e9960dbadcfbc982d7b5efc2f6748210d3dd53a7de062", size = 1325651, upload-time = "2025-08-12T06:59:55.073Z" }, + { url = "https://files.pythonhosted.org/packages/8d/de/5a007fb53b1ab0aafc69d11a5a3dd72a289d5a3e78dcf2c3a3d9b14ffe93/sentencepiece-0.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:097f3394e99456e9e4efba1737c3749d7e23563dd1588ce71a3d007f25475fff", size = 1253641, upload-time = "2025-08-12T06:59:56.562Z" }, + { url = "https://files.pythonhosted.org/packages/2c/d2/f552be5928105588f4f4d66ee37dd4c61460d8097e62d0e2e0eec41bc61d/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b670879c370d350557edabadbad1f6561a9e6968126e6debca4029e5547820", size = 1316271, upload-time = "2025-08-12T06:59:58.109Z" }, + { url = "https://files.pythonhosted.org/packages/96/df/0cfe748ace5485be740fed9476dee7877f109da32ed0d280312c94ec259f/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7f0fd2f2693309e6628aeeb2e2faf6edd221134dfccac3308ca0de01f8dab47", size = 1387882, upload-time = "2025-08-12T07:00:00.701Z" }, + { url = "https://files.pythonhosted.org/packages/ac/dd/f7774d42a881ced8e1739f393ab1e82ece39fc9abd4779e28050c2e975b5/sentencepiece-0.2.1-cp313-cp313-win32.whl", hash = "sha256:92b3816aa2339355fda2c8c4e021a5de92180b00aaccaf5e2808972e77a4b22f", size = 999541, upload-time = "2025-08-12T07:00:02.709Z" }, + { url = "https://files.pythonhosted.org/packages/dd/e9/932b9eae6fd7019548321eee1ab8d5e3b3d1294df9d9a0c9ac517c7b636d/sentencepiece-0.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:10ed3dab2044c47f7a2e7b4969b0c430420cdd45735d78c8f853191fa0e3148b", size = 1054669, upload-time = "2025-08-12T07:00:04.915Z" }, + { url = "https://files.pythonhosted.org/packages/c9/3a/76488a00ea7d6931689cda28726a1447d66bf1a4837943489314593d5596/sentencepiece-0.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac650534e2251083c5f75dde4ff28896ce7c8904133dc8fef42780f4d5588fcd", size = 1033922, upload-time = "2025-08-12T07:00:06.496Z" }, + { url = "https://files.pythonhosted.org/packages/4a/b6/08fe2ce819e02ccb0296f4843e3f195764ce9829cbda61b7513f29b95718/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:8dd4b477a7b069648d19363aad0cab9bad2f4e83b2d179be668efa672500dc94", size = 1946052, upload-time = "2025-08-12T07:00:08.136Z" }, + { url = "https://files.pythonhosted.org/packages/ab/d9/1ea0e740591ff4c6fc2b6eb1d7510d02f3fb885093f19b2f3abd1363b402/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0c0f672da370cc490e4c59d89e12289778310a0e71d176c541e4834759e1ae07", size = 1327408, upload-time = "2025-08-12T07:00:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/99/7e/1fb26e8a21613f6200e1ab88824d5d203714162cf2883248b517deb500b7/sentencepiece-0.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ad8493bea8432dae8d6830365352350f3b4144415a1d09c4c8cb8d30cf3b6c3c", size = 1254857, upload-time = "2025-08-12T07:00:11.021Z" }, + { url = "https://files.pythonhosted.org/packages/bc/85/c72fd1f3c7a6010544d6ae07f8ddb38b5e2a7e33bd4318f87266c0bbafbf/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b81a24733726e3678d2db63619acc5a8dccd074f7aa7a54ecd5ca33ca6d2d596", size = 1315722, upload-time = "2025-08-12T07:00:12.989Z" }, + { url = "https://files.pythonhosted.org/packages/4a/e8/661e5bd82a8aa641fd6c1020bd0e890ef73230a2b7215ddf9c8cd8e941c2/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0a81799d0a68d618e89063fb423c3001a034c893069135ffe51fee439ae474d6", size = 1387452, upload-time = "2025-08-12T07:00:15.088Z" }, + { url = "https://files.pythonhosted.org/packages/99/5e/ae66c361023a470afcbc1fbb8da722c72ea678a2fcd9a18f1a12598c7501/sentencepiece-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:89a3ea015517c42c0341d0d962f3e6aaf2cf10d71b1932d475c44ba48d00aa2b", size = 1002501, upload-time = "2025-08-12T07:00:16.966Z" }, + { url = "https://files.pythonhosted.org/packages/c1/03/d332828c4ff764e16c1b56c2c8f9a33488bbe796b53fb6b9c4205ddbf167/sentencepiece-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:33f068c9382dc2e7c228eedfd8163b52baa86bb92f50d0488bf2b7da7032e484", size = 1057555, upload-time = "2025-08-12T07:00:18.573Z" }, + { url = "https://files.pythonhosted.org/packages/88/14/5aee0bf0864df9bd82bd59e7711362908e4935e3f9cdc1f57246b5d5c9b9/sentencepiece-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:b3616ad246f360e52c85781e47682d31abfb6554c779e42b65333d4b5f44ecc0", size = 1036042, upload-time = "2025-08-12T07:00:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/24/9c/89eb8b2052f720a612478baf11c8227dcf1dc28cd4ea4c0c19506b5af2a2/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5d0350b686c320068702116276cfb26c066dc7e65cfef173980b11bb4d606719", size = 1943147, upload-time = "2025-08-12T07:00:21.809Z" }, + { url = "https://files.pythonhosted.org/packages/82/0b/a1432bc87f97c2ace36386ca23e8bd3b91fb40581b5e6148d24b24186419/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c7f54a31cde6fa5cb030370566f68152a742f433f8d2be458463d06c208aef33", size = 1325624, upload-time = "2025-08-12T07:00:23.289Z" }, + { url = "https://files.pythonhosted.org/packages/ea/99/bbe054ebb5a5039457c590e0a4156ed073fb0fe9ce4f7523404dd5b37463/sentencepiece-0.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c83b85ab2d6576607f31df77ff86f28182be4a8de6d175d2c33ca609925f5da1", size = 1253670, upload-time = "2025-08-12T07:00:24.69Z" }, + { url = "https://files.pythonhosted.org/packages/19/ad/d5c7075f701bd97971d7c2ac2904f227566f51ef0838dfbdfdccb58cd212/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1855f57db07b51fb51ed6c9c452f570624d2b169b36f0f79ef71a6e6c618cd8b", size = 1316247, upload-time = "2025-08-12T07:00:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/fb/03/35fbe5f3d9a7435eebd0b473e09584bd3cc354ce118b960445b060d33781/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01e6912125cb45d3792f530a4d38f8e21bf884d6b4d4ade1b2de5cf7a8d2a52b", size = 1387894, upload-time = "2025-08-12T07:00:28.339Z" }, + { url = "https://files.pythonhosted.org/packages/dc/aa/956ef729aafb6c8f9c443104c9636489093bb5c61d6b90fc27aa1a865574/sentencepiece-0.2.1-cp314-cp314-win32.whl", hash = "sha256:c415c9de1447e0a74ae3fdb2e52f967cb544113a3a5ce3a194df185cbc1f962f", size = 1096698, upload-time = "2025-08-12T07:00:29.764Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/fe400d8836952cc535c81a0ce47dc6875160e5fedb71d2d9ff0e9894c2a6/sentencepiece-0.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:881b2e44b14fc19feade3cbed314be37de639fc415375cefaa5bc81a4be137fd", size = 1155115, upload-time = "2025-08-12T07:00:32.865Z" }, + { url = "https://files.pythonhosted.org/packages/32/89/047921cf70f36c7b6b6390876b2399b3633ab73b8d0cb857e5a964238941/sentencepiece-0.2.1-cp314-cp314-win_arm64.whl", hash = "sha256:2005242a16d2dc3ac5fe18aa7667549134d37854823df4c4db244752453b78a8", size = 1133890, upload-time = "2025-08-12T07:00:34.763Z" }, + { url = "https://files.pythonhosted.org/packages/a1/11/5b414b9fae6255b5fb1e22e2ed3dc3a72d3a694e5703910e640ac78346bb/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:a19adcec27c524cb7069a1c741060add95f942d1cbf7ad0d104dffa0a7d28a2b", size = 1946081, upload-time = "2025-08-12T07:00:36.97Z" }, + { url = "https://files.pythonhosted.org/packages/77/eb/7a5682bb25824db8545f8e5662e7f3e32d72a508fdce086029d89695106b/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e37e4b4c4a11662b5db521def4e44d4d30ae69a1743241412a93ae40fdcab4bb", size = 1327406, upload-time = "2025-08-12T07:00:38.669Z" }, + { url = "https://files.pythonhosted.org/packages/03/b0/811dae8fb9f2784e138785d481469788f2e0d0c109c5737372454415f55f/sentencepiece-0.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:477c81505db072b3ab627e7eab972ea1025331bd3a92bacbf798df2b75ea86ec", size = 1254846, upload-time = "2025-08-12T07:00:40.611Z" }, + { url = "https://files.pythonhosted.org/packages/ef/23/195b2e7ec85ebb6a547969f60b723c7aca5a75800ece6cc3f41da872d14e/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:010f025a544ef770bb395091d57cb94deb9652d8972e0d09f71d85d5a0816c8c", size = 1315721, upload-time = "2025-08-12T07:00:42.914Z" }, + { url = "https://files.pythonhosted.org/packages/7e/aa/553dbe4178b5f23eb28e59393dddd64186178b56b81d9b8d5c3ff1c28395/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:733e59ff1794d26db706cd41fc2d7ca5f6c64a820709cb801dc0ea31780d64ab", size = 1387458, upload-time = "2025-08-12T07:00:44.56Z" }, + { url = "https://files.pythonhosted.org/packages/66/7c/08ff0012507297a4dd74a5420fdc0eb9e3e80f4e88cab1538d7f28db303d/sentencepiece-0.2.1-cp314-cp314t-win32.whl", hash = "sha256:d3233770f78e637dc8b1fda2cd7c3b99ec77e7505041934188a4e7fe751de3b0", size = 1099765, upload-time = "2025-08-12T07:00:46.058Z" }, + { url = "https://files.pythonhosted.org/packages/91/d5/2a69e1ce15881beb9ddfc7e3f998322f5cedcd5e4d244cb74dade9441663/sentencepiece-0.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5e4366c97b68218fd30ea72d70c525e6e78a6c0a88650f57ac4c43c63b234a9d", size = 1157807, upload-time = "2025-08-12T07:00:47.673Z" }, + { url = "https://files.pythonhosted.org/packages/f3/16/54f611fcfc2d1c46cbe3ec4169780b2cfa7cf63708ef2b71611136db7513/sentencepiece-0.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:105e36e75cbac1292642045458e8da677b2342dcd33df503e640f0b457cb6751", size = 1136264, upload-time = "2025-08-12T07:00:49.485Z" }, +] + [[package]] name = "setuptools" version = "81.0.0" @@ -976,22 +1040,26 @@ name = "vllmini" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "bitsandbytes" }, { name = "huggingface-hub" }, { name = "packaging" }, { name = "protobuf" }, { name = "pytest" }, { name = "safetensors" }, + { name = "sentencepiece" }, { name = "torch" }, { name = "transformers" }, ] [package.metadata] requires-dist = [ + { name = "bitsandbytes", specifier = ">=0.49.2" }, { name = "huggingface-hub", specifier = ">=1.11.0" }, { name = "packaging", specifier = ">=26.1" }, { name = "protobuf", specifier = ">=7.34.1" }, { name = "pytest", specifier = ">=9.0.3" }, { name = "safetensors", specifier = ">=0.7.0" }, + { name = "sentencepiece", specifier = ">=0.2.1" }, { name = "torch", specifier = ">=2.11.0" }, { name = "transformers", specifier = ">=5.5.4" }, ]