From 8342c76d80713a199013155accf7cbe5a2dea739 Mon Sep 17 00:00:00 2001
From: vincentzed <207368749+vincentzed@users.noreply.github.com>
Date: Fri, 30 Jan 2026 23:02:12 -0500
Subject: [PATCH 1/2] WIP add sglang
Signed-off-by: vincentzed <207368749+vincentzed@users.noreply.github.com>
---
docs.json | 1 +
docs/inference/sglang.mdx | 408 ++++++++++++++++++++++++++++++++++++++
2 files changed, 409 insertions(+)
create mode 100644 docs/inference/sglang.mdx
diff --git a/docs.json b/docs.json
index 1e1bcbd..8dade2d 100644
--- a/docs.json
+++ b/docs.json
@@ -90,6 +90,7 @@
"docs/inference/transformers",
"docs/inference/llama-cpp",
"docs/inference/vllm",
+ "docs/inference/sglang",
"docs/inference/mlx",
"docs/inference/ollama",
{
diff --git a/docs/inference/sglang.mdx b/docs/inference/sglang.mdx
new file mode 100644
index 0000000..0bb3408
--- /dev/null
+++ b/docs/inference/sglang.mdx
@@ -0,0 +1,408 @@
+---
+title: "SGLang"
+description: "SGLang is a fast serving framework for large language models. It features RadixAttention for efficient prefix caching, optimized CUDA kernels, and continuous batching for high-throughput, low-latency inference."
+---
+
+
+ Use SGLang for low-latency online deployments such as RAG pipelines, search engines, and real-time chat applications.
+
+
+SGLang delivers ultra-low latency and high throughput, making it well-suited for production serving scenarios with many concurrent requests. It requires a CUDA-compatible GPU. For CPU-only environments, consider using [llama.cpp](/docs/inference/llama-cpp) instead.
+
+## Installation
+
+Install SGLang following the [official installation guide](https://docs.sglang.io/get_started/install_sglang.html). The recommended method is:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang"
+```
+
+For other installation methods (source, Kubernetes), refer to the [SGLang documentation](https://docs.sglang.io/get_started/install_sglang.html).
+
+### Docker
+
+
+Please use the `dev` tag for LFM model support, as no stable release of SGLang supports it yet. For CUDA 13 environments (B300/GB300), use `lmsysorg/sglang:dev-cu13`.
+
+
+You can also run SGLang using Docker:
+
+```bash
+docker run --gpus all \
+ --shm-size 32g \
+ -p 30000:30000 \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HF_TOKEN=" \
+ --ipc=host \
+ lmsysorg/sglang:dev \
+ python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-1.2B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --chunked-prefill-size -1
+```
+
+## Launching the Server
+
+Start the SGLang server with the following command:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-1.2B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --chunked-prefill-size -1
+```
+
+Optional parameters:
+
+* `--chunked-prefill-size -1`: Disables chunked prefill for lower latency
+* `--max-model-len L`: Set maximum context length
+* `--dtype auto`: Automatically select the data type
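+
+Once the server is up, you can confirm the OpenAI-compatible endpoint is reachable by listing the served models (a minimal check, assuming the default host and port used above):
+
+```python
+from openai import OpenAI
+
+# Point the client at the local SGLang server; the API key is not checked.
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")
+
+# The model passed to --model-path should appear in this list.
+for model in client.models.list():
+    print(model.id)
+```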
+
+### Ultra Low Latency on Blackwell (B300)
+
+Running a 1.2B model on a B300 may sound counterintuitive, but combining `--enable-torch-compile` with Blackwell's architecture unlocks extremely low latency — ideal for latency-sensitive workloads like RAG, search, and real-time chat.
+
+
+ If your workload has concurrency under 256, we recommend using `--enable-torch-compile` for significantly lower latency. For pure throughput batch processing at very high concurrency, skip this flag.
+
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-1.2B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --enable-torch-compile \
+ --cuda-graph-max-bs 4 \
+ --chunked-prefill-size -1
+```
+
+On B300/CUDA 13, use the dedicated Docker image:
+
+```bash
+docker run --gpus all \
+ --shm-size 32g \
+ -p 30000:30000 \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HF_TOKEN=" \
+ --ipc=host \
+ lmsysorg/sglang:dev-cu13 \
+ python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-1.2B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --enable-torch-compile \
+ --cuda-graph-max-bs 4 \
+ --chunked-prefill-size -1
+```
+
+With this configuration, mean end-to-end latency stays under **200 ms** per request, with a median around 120 ms. Benchmark results on a B300 GPU with CUDA 13:
+
+```
+============ Serving Benchmark Result ============
+Backend: sglang-oai-chat
+Successful requests: 256
+Benchmark duration (s): 49.16
+Total input tokens: 82131
+Total generated tokens: 54126
+Request throughput (req/s): 5.21
+Input token throughput (tok/s): 1670.54
+Output token throughput (tok/s): 1100.92
+Total token throughput (tok/s): 2771.47
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 191.88
+Median E2E Latency (ms): 119.90
+P99 E2E Latency (ms): 760.65
+---------------Time to First Token----------------
+Mean TTFT (ms): 8.79
+Median TTFT (ms): 8.11
+P99 TTFT (ms): 17.45
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 0.86
+Median TPOT (ms): 0.86
+P99 TPOT (ms): 0.89
+==================================================
+```
+
+To reproduce this benchmark against the running server:
+
+ ```bash
+ python3 -m sglang.bench_serving \
+ --backend sglang-oai-chat \
+ --num-prompts 256 \
+ --max-concurrency 1 \
+ --random-input-len 1024 \
+ --random-output-len 128 \
+ --warmup-requests 128
+ ```
+
+
+## Chat Completions
+
+Once the server is running, use the OpenAI Python client or any OpenAI-compatible tool:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+response = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-1.2B-Instruct",
+ messages=[
+ {"role": "user", "content": "What is machine learning?"}
+ ],
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=512,
+)
+
+print(response.choices[0].message.content)
+```
+
+### Streaming
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+stream = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-1.2B-Instruct",
+ messages=[
+ {"role": "user", "content": "Tell me a story."}
+ ],
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=512,
+ stream=True,
+)
+
+for chunk in stream:
+ if chunk.choices[0].delta.content is not None:
+ print(chunk.choices[0].delta.content, end="")
+```
+
+### Multi-turn Conversations
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+response = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-1.2B-Instruct",
+ messages=[
+ {
+ "role": "system",
+ "content": "You are a knowledgeable historian who provides concise responses.",
+ },
+ {"role": "user", "content": "Tell me about ancient Rome"},
+ {
+ "role": "assistant",
+ "content": "Ancient Rome was a civilization centered in Italy.",
+ },
+ {"role": "user", "content": "What were their major achievements?"},
+ ],
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=128,
+)
+
+print(response.choices[0].message.content)
+```
+
+You can also call the server directly with cURL:
+
+ ```bash
+ curl http://localhost:30000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "LiquidAI/LFM2.5-1.2B-Instruct",
+ "messages": [
+ {"role": "user", "content": "What is AI?"}
+ ],
+ "temperature": 0.3,
+ "min_p": 0.15,
+ "repetition_penalty": 1.05,
+ "max_tokens": 256
+ }'
+ ```
+
+
+## Tool Calling
+
+SGLang supports tool calling (function calling) with LFM models via the `--tool-call-parser` flag. Launch the server with tool calling enabled:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-1.2B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --chunked-prefill-size -1 \
+ --tool-call-parser lfm2
+```
+
+Then use the OpenAI tools API:
+
+```python
+import json
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_candidate_status",
+ "description": "Retrieves the current status of a candidate in the recruitment process",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "candidate_id": {
+ "type": "string",
+ "description": "Unique identifier for the candidate",
+ }
+ },
+ "required": ["candidate_id"],
+ },
+ },
+ }
+]
+
+messages = [
+ {"role": "user", "content": "What is the current status of candidate ID 12345?"}
+]
+
+response = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-1.2B-Instruct",
+ messages=messages,
+ tools=tools,
+ tool_choice="auto",
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=512,
+)
+
+# The model may return a tool call
+tool_calls = response.choices[0].message.tool_calls
+if tool_calls:
+ print(f"Function: {tool_calls[0].function.name}")
+ print(f"Arguments: {tool_calls[0].function.arguments}")
+```
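+
+If the model does return a tool call, you execute the function yourself and send the result back in a follow-up request so the model can produce a final answer. The sketch below is a minimal continuation of the example above (reusing `client`, `tools`, `messages`, `response`, and `tool_calls`), with a hypothetical local `get_candidate_status` implementation:
+
+```python
+# Continuing the example above; get_candidate_status is a hypothetical stand-in
+# for a real lookup against your recruitment system.
+def get_candidate_status(candidate_id: str) -> dict:
+    return {"candidate_id": candidate_id, "status": "interview_scheduled"}
+
+if tool_calls:
+    call = tool_calls[0]
+    arguments = json.loads(call.function.arguments)
+    result = get_candidate_status(**arguments)
+
+    # Append the assistant turn containing the tool call, then the tool result.
+    messages.append(response.choices[0].message)
+    messages.append(
+        {
+            "role": "tool",
+            "tool_call_id": call.id,
+            "content": json.dumps(result),
+        }
+    )
+
+    # Ask the model to turn the tool result into a final answer.
+    final = client.chat.completions.create(
+        model="LiquidAI/LFM2.5-1.2B-Instruct",
+        messages=messages,
+        tools=tools,
+        temperature=0.3,
+    )
+    print(final.choices[0].message.content)
+```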
+
+For more details on tool parsing configuration, see the [SGLang Tool Parser documentation](https://docs.sglang.io/advanced_features/tool_parser.html).
+
+## Vision Models
+
+### Installation for Vision Models
+
+To use LFM Vision Models with SGLang, install the required transformers version:
+
+```bash
+pip install transformers==5.0.0
+```
+
+
+Transformers v5 is newly released. If you encounter issues, fall back to the pinned git source:
+```bash
+pip install git+https://github.com/huggingface/transformers.git@3c2517727ce28a30f5044e01663ee204deb1cdbe
+```
+
+
+### Launching the Server
+
+Serve the vision model with `--trust-remote-code`:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path LiquidAI/LFM2.5-VL-1.6B \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --trust-remote-code
+```
+
+### OpenAI-Compatible API
+
+Then use the OpenAI client with image content:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+response = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-VL-1.6B",
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Describe what you see in this image."},
+ {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/val2017/000000039769.jpg"}}
+ ]
+ }
+ ],
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=512,
+)
+
+print(response.choices[0].message.content)
+```
+
+You can also pass base64-encoded images:
+
+```python
+import base64
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="None"
+)
+
+# Load and encode image
+with open("path/to/image.jpg", "rb") as f:
+ image_base64 = base64.b64encode(f.read()).decode()
+
+# Chat completion with image
+response = client.chat.completions.create(
+ model="LiquidAI/LFM2.5-VL-1.6B",
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Describe this image in detail."},
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
+ ]
+ }
+ ],
+ temperature=0.3,
+ min_p=0.15,
+ repetition_penalty=1.05,
+ max_tokens=512,
+)
+
+print(response.choices[0].message.content)
+```
From 48724e45662b03428bd7bddbd93329548bbdbfbf Mon Sep 17 00:00:00 2001
From: vincentzed <207368749+vincentzed@users.noreply.github.com>
Date: Fri, 30 Jan 2026 23:06:54 -0500
Subject: [PATCH 2/2] WIP
Signed-off-by: vincentzed <207368749+vincentzed@users.noreply.github.com>
---
docs/inference/sglang.mdx | 21 ++++++++-------------
1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/docs/inference/sglang.mdx b/docs/inference/sglang.mdx
index 0bb3408..65bed78 100644
--- a/docs/inference/sglang.mdx
+++ b/docs/inference/sglang.mdx
@@ -59,8 +59,6 @@ python3 -m sglang.launch_server \
Optional parameters:
* `--chunked-prefill-size -1`: Disables chunked prefill for lower latency
-* `--max-model-len L`: Set maximum context length
-* `--dtype auto`: Automatically select the data type
### Ultra Low Latency on Blackwell (B300)
@@ -76,7 +74,6 @@ python3 -m sglang.launch_server \
--host 0.0.0.0 \
--port 30000 \
--enable-torch-compile \
- --cuda-graph-max-bs 4 \
--chunked-prefill-size -1
```
@@ -95,7 +92,6 @@ docker run --gpus all \
--host 0.0.0.0 \
--port 30000 \
--enable-torch-compile \
- --cuda-graph-max-bs 4 \
--chunked-prefill-size -1
```
@@ -159,7 +155,7 @@ response = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=512,
+
)
print(response.choices[0].message.content)
@@ -183,7 +179,7 @@ stream = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=512,
+
stream=True,
)
@@ -219,7 +215,7 @@ response = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=128,
+
)
print(response.choices[0].message.content)
@@ -236,8 +232,7 @@ print(response.choices[0].message.content)
],
"temperature": 0.3,
"min_p": 0.15,
- "repetition_penalty": 1.05,
- "max_tokens": 256
+ "repetition_penalty": 1.05
}'
```
@@ -298,7 +293,7 @@ response = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=512,
+
)
# The model may return a tool call
@@ -329,7 +324,7 @@ pip install git+https://github.com/huggingface/transformers.git@3c2517727ce28a30
### Launching the Server
-Serve the vision model with `--trust-remote-code`:
+Serve the vision model with `--trust-remote-code` (required because the vision processor code is loaded from the remote Hugging Face repository):
```bash
python3 -m sglang.launch_server \
@@ -365,7 +360,7 @@ response = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=512,
+
)
print(response.choices[0].message.content)
@@ -401,7 +396,7 @@ response = client.chat.completions.create(
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
- max_tokens=512,
+
)
print(response.choices[0].message.content)