generative-computing · antonpibm · Jun 9, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
@@ -80,21 +80,23 @@ granite-switch/
 
 ## Installation (local/dev)
 
+This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/).
+
 ```bash
 # Core package only (config)
-pip install -e .
+uv sync --no-default-groups
 
 # With HuggingFace backend
-pip install -e ".[hf]"
+uv sync --extra hf
 
 # With vLLM backend
-pip install -e ".[vllm]"
+uv sync --extra vllm
 
 # With compose tools
-pip install -e ".[compose]"
+uv sync --extra compose
 
 # Everything (development)
-pip install -e ".[dev]"
+uv sync --group dev
 ```
 
 ## Import Paths

@@ -2,17 +2,27 @@
 
 Thank you for your interest in contributing to Granite Switch!
 
+## Prerequisites
+
+This project uses [uv](https://docs.astral.sh/uv/) for dependency management. Install it once before working on the project:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+Or via pip: `pip install uv`
+
 ## Getting Started
 
 1. Fork the repository
 2. Clone your fork and install dependencies:
    ```bash
    git clone https://github.com/<your-username>/granite-switch.git
    cd granite-switch
-   pip install -e ".[dev]"
+   uv sync --group dev
    ```
 3. Create a feature branch and make your changes
-4. Run tests: `pytest tests/ -v`
+4. Run tests: `uv run pytest tests/ -v`
 5. Submit a pull request
 
 ## Contribution Guidelines

@@ -51,7 +51,11 @@ pip install "granite-switch[vllm20]"    # vLLM 0.20+ (requires CUDA 13+)
 pip install "granite-switch[hf,vllm,compose]"  # Everything
 ```
 
-Requires Python 3.9+ and PyTorch 2.0+. Two vLLM backends are available: `.[vllm]` for broad CUDA 12.x compatibility (0.19.x), and `.[vllm20]` for the latest performance improvements (CUDA 13+).
+Requires Python 3.10+ and PyTorch 2.0+.
+
+> **vLLM version note:** This project currently defaults to vLLM 0.19.1 due to vLLM 0.20's
+> dependency on CUDA 13.0+ (via PyTorch 2.11), which is incompatible with many existing
+> environments running CUDA 12.x drivers. Use `.[vllm20]` if your environment supports CUDA 13+.
 
 ### Compose a Model
 
@@ -76,6 +80,17 @@ This downloads the base model, embeds compatible LoRA adapters (with a preferenc
 
 ### Run Inference
 
+> **Tip: pre-download the model for faster startup.** The first run will download several GB from Hugging Face, which can be slow. To download in advance using the fast transfer backend:
+> ```bash
+> pip install "huggingface_hub[hf_transfer]"
+> hf auth login                                  # one-time, if not already logged in
+> HF_HUB_ENABLE_HF_TRANSFER=1 hf download ibm-granite/granite-switch-4.1-3b-preview
+> ```
+> Subsequent runs will use the local cache automatically.
+
+**vLLM + Mellea (recommended):**
+
+
 ```bash
 pip install mellea
 python -m vllm.entrypoints.openai.api_server --model ibm-granite/granite-switch-4.1-3b-preview --port 8000

@@ -62,7 +62,7 @@ Fixes #123
 
 Before committing:
 
-1. **Run tests**: `pytest tests/ -v`
+1. **Run tests**: `uv run pytest tests/ -v`
 2. **Check comments match code** — stale comments are worse than no comments
 3. **Update docs** if behavior changed
 

@@ -36,9 +36,6 @@ tutorials = [
     "sentence-transformers>=3.0.0",
     "datasets>=2.0.0",
 ]
-dev = ["pytest", "granite-switch[hf,vllm,compose]"]
-dev-vllm20 = ["pytest", "granite-switch[hf,vllm20,compose]"]
-test = ["pytest", "bitsandbytes", "optimum-quanto"]
 
 [project.entry-points."vllm.general_plugins"]
 register_granite_switch = "granite_switch.vllm:register"
@@ -54,32 +51,31 @@ markers = [
     "requires_model: needs a real model checkpoint",
 ]
 
+[dependency-groups]
+vllm19     = ["vllm>=0.19.1,<0.20.0"]
+vllm20     = ["vllm>=0.20.0,<0.21.0"]
+dev        = ["pytest", { include-group = "vllm19" }, "granite-switch[hf,compose]"]
+dev-vllm20 = ["pytest", { include-group = "vllm20" }, "granite-switch[hf,compose]"]
+test       = ["pytest", "bitsandbytes", "optimum-quanto", { include-group = "dev" }]
+
 [tool.uv]
+default-groups = ["vllm19"]
 conflicts = [
-    [
-        { extra = "vllm" },
-        { extra = "vllm20" },
-    ],
-    [
-        { extra = "dev" },
-        { extra = "vllm20" },
-    ],
-    [
-        { extra = "dev" },
-        { extra = "dev-vllm20" },
-    ],
-    [
-        { extra = "dev-vllm20" },
-        { extra = "vllm" },
-    ],
-    [
-        { extra = "tutorials" },
-        { extra = "vllm20" },
-    ],
-    [
-        { extra = "tutorials" },
-        { extra = "dev-vllm20" },
-    ],
+    # group-vs-group
+    [{ group = "vllm19" },     { group = "vllm20" }],
+    [{ group = "dev" },        { group = "vllm20" }],
+    [{ group = "dev" },        { group = "dev-vllm20" }],
+    [{ group = "dev-vllm20" }, { group = "vllm19" }],
+    # group-vs-extra
+    [{ group = "vllm19" },     { extra = "vllm20" }],
+    [{ group = "vllm20" },     { extra = "vllm" }],
+    [{ group = "vllm20" },     { extra = "tutorials" }],
+    [{ group = "dev" },        { extra = "vllm20" }],
+    [{ group = "dev-vllm20" }, { extra = "vllm" }],
+    [{ group = "dev-vllm20" }, { extra = "tutorials" }],
+    # extra-vs-extra
+    [{ extra = "vllm" },       { extra = "vllm20" }],
+    [{ extra = "tutorials" },  { extra = "vllm20" }],
 ]
 
 [tool.setuptools.packages.find]

@@ -20,29 +20,29 @@ Python 3.10+ is required.
 ### Base Installation
 
 ```bash
-pip install granite-switch
+pip install "granite-switch"
 ```
 
 ### HuggingFace Backend
 
 For direct model inference with HuggingFace Transformers:
 
 ```bash
-pip install "granite-switch[hf,compose]"
+pip install "granite-switch[hf]"
 ```
 
 This includes:
 - `transformers` for model loading and generation
 - `torch` with CUDA support
-- `peft` for LoRA operations
 - Compose tools for model building (requires the additional `compose` extra: `pip install "granite-switch[hf,compose]"`)
 
 ### vLLM Backend
 
 For production inference with vLLM:
 
 ```bash
-pip install "granite-switch[vllm]"
+pip install "granite-switch[vllm]"    # CUDA 12.x
+pip install "granite-switch[vllm20]"  # CUDA 13+ (requires PyTorch 2.11+)
 ```
 
 This includes:

@@ -26,7 +26,7 @@ The notebook runs both servers sequentially on a single A100 GPU and produces
 - Two GPUs (one per server) for simultaneous mode, or one GPU for sequential mode
 - Install dependencies:
   ```bash
-  pip install -e ".[vllm]"
+  pip install "granite-switch[vllm]"
   pip install mellea chromadb rich tqdm transformers httpx
   ```
 - Build the ChromaDB index (once):