Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
future_plans/
__pycache__/
*.pyc
**/__pycache__/
*.pyc

docs/quantisation.md

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Description: Check if quantisation.md exists and preview its content to determine if it should be tracked

# Check if the file exists
if [ -f "docs/quantisation.md" ]; then
  echo "File exists. First 20 lines:"
  head -n 20 docs/quantisation.md
  echo ""
  echo "File size:"
  wc -l docs/quantisation.md
else
  echo "File does not exist yet"
fi

Repository: Lothnic/vllmini

Length of output: 83


🏁 Script executed:

# Check git diff to see what's being changed
git diff --name-only

# Check what files are in the docs directory
ls -la docs/ 2>/dev/null || echo "docs directory does not exist or is empty"

# Search for any quantisation-related files
find . -name "*quantisation*" -o -name "*quantization*" 2>/dev/null | head -20

# Check the .gitignore file to see context
cat -n .gitignore

Repository: Lothnic/vllmini

Length of output: 322


🏁 Script executed:

# Search for quantisation/quantization references
rg -i "quantis" --type py --type md --type txt --type sh

# Check for documentation generation scripts
fd -e "py" -e "sh" -e "mk" | xargs rg -l "doc\|md\|markdown" 2>/dev/null | head -20

# Look at git log to see if quantisation.md is mentioned in any commits
git log --all --oneline --grep="quantisation\|quantization" 2>/dev/null | head -10

# Check if there are any build or doc generation scripts
fd "build\|gen\|make" -type f | head -20

Repository: Lothnic/vllmini

Length of output: 497


🏁 Script executed:

# Search for quantisation in Python source files
rg -i "quantis" --type py -A 3 -B 1

# Check what Python files contain quantisation-related code
rg -i "quantis" --type py -l

# Look for bitsandbytes imports or usage
rg "bitsandbytes\|bnb" --type py

Repository: Lothnic/vllmini

Length of output: 41


Remove docs/quantisation.md from .gitignore or provide clarification.

Documentation files should typically be committed to the repository. The docs/ directory already tracks sampler.md, and this project follows a pattern of version-controlling documentation. Preemptively ignoring a non-existent documentation file for a feature currently on a separate branch is inconsistent with the project's practices and may cause confusion when the quantisation documentation is later created.

If this file will be auto-generated or excluded for a specific reason, clarify that in the commit message.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.gitignore at line 5, remove the premature ignore entry "docs/quantisation.md"
from .gitignore (or replace it with a commented explanation) so documentation
files are tracked consistently; locate the "docs/quantisation.md" line in
.gitignore and either delete that line or change it to a commented note
explaining why this specific doc should be ignored/generated (and include the
rationale in the commit message).

Binary file removed __pycache__/main.cpython-312.pyc
Binary file not shown.
Binary file removed engine/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file removed engine/__pycache__/generator.cpython-312.pyc
Binary file not shown.
Binary file removed engine/__pycache__/sampler.cpython-312.pyc
Binary file not shown.
12 changes: 10 additions & 2 deletions engine/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ def __init__(self, model, tokenizer, sampler: Sampler | None = None):
@torch.inference_mode()
def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams | None = None):
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.config.device)
prompt_len = input_ids.shape[1]
past_key_values = None
prev_text = ""

for _ in range(max_new_tokens):
if past_key_values is None:
Expand All @@ -25,5 +27,11 @@ def generate(self, prompt: str, max_new_tokens: int = 50, params: SamplingParams

if next_token.item() == self.tokenizer.eos_token_id:
break

yield self.tokenizer.decode(next_token[0], skip_special_tokens=True)

# Decode all generated tokens so far and yield only the new text.
# This correctly handles SentencePiece space prefixes and multi-byte chars.
full_text = self.tokenizer.decode(input_ids[0, prompt_len:], skip_special_tokens=True)
new_text = full_text[len(prev_text):]
prev_text = full_text
if new_text:
yield new_text
38 changes: 20 additions & 18 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""CLI entry point."""
import os
import warnings
warnings.filterwarnings("ignore", message=".*_check_is_size.*", category=FutureWarning)
import argparse
import torch
from huggingface_hub import try_to_load_from_cache
from transformers import AutoTokenizer

from models.weight_loader import load_hf_model
Expand All @@ -15,15 +16,6 @@
MODEL_ID = "Qwen/Qwen3-0.6B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Resolve local cache path if model is already downloaded
# Passing a local dir path to AutoTokenizer prevents ALL network calls
_cached = try_to_load_from_cache(MODEL_ID, "config.json")
LOCAL_MODEL_PATH = os.path.dirname(_cached) if isinstance(_cached, str) else None

if LOCAL_MODEL_PATH:
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"


def parse_args():
parser = argparse.ArgumentParser(description="vLLMini Chat")
Expand All @@ -33,6 +25,7 @@ def parse_args():
parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
parser.add_argument("--top-p", type=float, default=0.9, help="Nucleus sampling threshold")
parser.add_argument("--max-tokens", type=int, default=2048, help="Maximum new tokens to generate")
parser.add_argument("--quantize", "-q", action="store_true", default=False, help="Enable 4-bit NF4 quantization (requires bitsandbytes)")
return parser.parse_args()


Expand All @@ -45,15 +38,19 @@ def strip_thinking(output: str) -> str:

def main():
args = parse_args()

# Don't force offline mode for model loading — the weight_loader
# handles local-first-then-download fallback on its own.
os.environ.pop("HF_HUB_OFFLINE", None)
os.environ.pop("TRANSFORMERS_OFFLINE", None)
Comment on lines +41 to +45

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Removing user-set environment variables may cause unexpected behavior.

Explicitly removing HF_HUB_OFFLINE and TRANSFORMERS_OFFLINE could surprise users who intentionally set these variables in their environment. The weight loader's local-first fallback doesn't fully replicate offline mode semantics (which prevents any network access).

Consider logging when these are removed, or respecting them if explicitly set.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@main.py` around lines 41 - 45, Don't unconditionally remove HF_HUB_OFFLINE
and TRANSFORMERS_OFFLINE; instead, stop calling os.environ.pop("HF_HUB_OFFLINE",
None) and os.environ.pop("TRANSFORMERS_OFFLINE", None) and implement a
conditional path: check for those keys in os.environ and if present, write a
clear log message (e.g., via logger.warning/info) that the process will respect the
user's offline setting and avoid forcing online model downloads, otherwise
proceed with normal model-loading behavior (local-first-then-download). Update
the code around the existing os.environ.pop calls in main.py to perform the
presence check and logging rather than removing the variables.


model, config = load_hf_model(args.model_id, device=args.device, quantize=args.quantize)

tokenizer = AutoTokenizer.from_pretrained(args.model_id)
# chat = [{"role": "user", "content": "Write a short story about a robot."}]
# prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

model, config = load_hf_model(args.model_id, device=args.device)

# Use local model path if available (from user's caching logic)
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH or args.model_id)
chat = [{"role": "user", "content": "Write a short story about a robot."}]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# prompt = "Write a very long story about a robot."
prompt = "Write a very long story about a robot."

params = SamplingParams(temperature=args.temperature, top_p=args.top_p)
sampler = Sampler()
Expand Down Expand Up @@ -117,6 +114,11 @@ def main():
if remainder:
print(remainder, end="", flush=True)
parts.append(remainder)
elif not indicator_shown and len(buffer) > 20:
# Model doesn't use <think> tags — flush buffer and stream normally
thinking_done = True
print(buffer, end="", flush=True)
parts.append(buffer)
# Otherwise keep accumulating silently
else:
# Either HIDE_THINKING is False, or we're past </think>
Expand Down
Binary file removed models/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file removed models/__pycache__/attention.cpython-312.pyc
Binary file not shown.
Binary file removed models/__pycache__/base.cpython-312.pyc
Binary file not shown.
Binary file removed models/__pycache__/llama.cpython-312.pyc
Binary file not shown.
Binary file removed models/__pycache__/qwen3.cpython-312.pyc
Binary file not shown.
Binary file removed models/__pycache__/weight_loader.cpython-312.pyc
Binary file not shown.
9 changes: 5 additions & 4 deletions models/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import torch.nn as nn
import torch.nn.functional as F
import math
from models.base import get_linear_layer

def rotate_half(x: torch.Tensor) -> torch.Tensor:
x1, x2 = x.chunk(2, dim=-1)
Expand All @@ -19,10 +20,10 @@ def __init__(self, config, rotary_emb):
self.head_dim = config.head_dim
self.hidden_size = config.hidden_size

self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
self.q_proj = get_linear_layer(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
self.k_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
self.v_proj = get_linear_layer(self.hidden_size, self.num_kv_heads * self.head_dim, bias=config.attention_bias, quantize=config.quantize)
self.o_proj = get_linear_layer(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias, quantize=config.quantize)
self.rotary_emb = rotary_emb

def core_attention(self, q, k, v, q_len, kv_len):
Expand Down
13 changes: 13 additions & 0 deletions models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
import torch
import torch.nn as nn


def get_linear_layer(in_features: int, out_features: int, bias: bool, quantize: bool = False) -> nn.Module:
    """Build a linear projection, optionally backed by 4-bit NF4 weights.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: Whether the layer carries an additive bias term.
        quantize: When true, return ``bnb.nn.Linear4bit`` instead of
            ``nn.Linear``.

    Returns:
        ``nn.Linear`` when ``quantize`` is false; otherwise a
        ``bitsandbytes.nn.Linear4bit`` created with ``quant_type="nf4"`` and
        ``compute_dtype=torch.bfloat16``.

    Raises:
        ImportError: If ``quantize`` is true and ``bitsandbytes`` cannot be
            imported.
    """
    if not quantize:
        return nn.Linear(in_features, out_features, bias=bias)

    # Imported lazily so the optional dependency is only required when the
    # caller actually asks for quantized layers.
    try:
        import bitsandbytes as bnb
    except ImportError as err:
        raise ImportError(
            "4-bit quantization requires the optional 'bitsandbytes' package; "
            "install it with `pip install bitsandbytes` or disable quantization."
        ) from err

    return bnb.nn.Linear4bit(
        in_features, out_features, bias=bias,
        compute_dtype=torch.bfloat16,
        quant_type="nf4",
    )


class CausalLM(ABC, nn.Module):
"""Every model must implement this interface. The engine never looks inside."""

Expand Down
9 changes: 5 additions & 4 deletions models/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from models.base import CausalLM
from models.base import CausalLM, get_linear_layer
from models.attention import Attention, FlashAttention


Expand All @@ -24,6 +24,7 @@ class LlamaConfig:
head_dim: int | None = None
dtype: torch.dtype = torch.bfloat16
device: str = "cuda"
quantize: bool = False

def __post_init__(self):
if self.head_dim is None:
Expand Down Expand Up @@ -59,9 +60,9 @@ def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
class MLP(nn.Module):
    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

    All three projections are bias-free and are created through
    ``get_linear_layer`` with ``quantize=config.quantize``.
    """

    def __init__(self, config: LlamaConfig):
        """Create the gate, up and down projections from ``config`` sizes."""
        super().__init__()
        hidden_dim = config.hidden_size
        inner_dim = config.intermediate_size
        use_quant = config.quantize

        # Creation order (gate, up, down) is kept so parameter registration
        # order stays the same.
        self.gate_proj = get_linear_layer(hidden_dim, inner_dim, bias=False, quantize=use_quant)
        self.up_proj = get_linear_layer(hidden_dim, inner_dim, bias=False, quantize=use_quant)
        self.down_proj = get_linear_layer(inner_dim, hidden_dim, bias=False, quantize=use_quant)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the SiLU-gated projection to ``x`` and project it back down."""
        gated = F.silu(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gated * lifted)
Expand Down
Loading
Loading