....
import random
# Add this just before the partition step: move a randomly chosen element
# into the last slot so it is the one used as the pivot.
# The index must be drawn exactly once. Calling randint() on both sides of
# the tuple assignment yields two independent indices, which overwrites one
# element and duplicates another instead of swapping them.
pivot_index = random.randint(low, high)
arr[pivot_index], arr[high] = arr[high], arr[pivot_index]
2.
==================================================
Throughput: 13.28 tok/s
Download complete: : 0.00B [02:34, ?B/s]
a1-6@192 dflash %
#!/usr/bin/env python3
"""Test script for DFlash on MLX.

Loads a target model and a draft model, streams a completion for a single
chat prompt, and prints the throughput figure reported by the generator.
"""
from dflash.model_mlx import load, load_draft, stream_generate

# Load the target model (with its tokenizer) and the draft model.
print("Loading target model: mlx-community/Qwen3.6-27B-4bit...")
model, tokenizer = load("mlx-community/Qwen3.6-27B-4bit")
print("Loading draft model: z-lab/Qwen3.6-27B-DFlash...")
draft = load_draft("z-lab/Qwen3.6-27B-DFlash")

# Build the prompt: one user turn rendered through the chat template as text.
user_turn = {"role": "user", "content": "写个 Python 快排"}
chat = [user_turn]
template_options = {
    "tokenize": False,
    "add_generation_prompt": True,
    "enable_thinking": True,
}
prompt = tokenizer.apply_chat_template(chat, **template_options)

print(f"\nPrompt: {chat[0]['content']}")
print("=" * 50)

# Generate, echoing each streamed piece of text as soon as it arrives.
decode_options = {
    "block_size": 16,
    "max_tokens": 2048,
    "temperature": 0.6,
}
throughput = 0.0  # remains 0.0 if the generator yields nothing
print("\n生成结果:\n")
token_stream = stream_generate(model, draft, tokenizer, prompt, **decode_options)
for chunk in token_stream:
    print(chunk.text, end="", flush=True)
    # Keep only the most recent reading; the last one is what gets reported.
    throughput = chunk.generation_tps

print(f"\n\n{'=' * 50}")
print(f"Throughput: {throughput:.2f} tok/s")
python test_mlx.py
same Qwen3.6-27