From 14651ed645735860828f23af6f06593c1a37f58c Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Wed, 24 Jun 2026 15:51:48 +0800 Subject: [PATCH 1/8] Add OPSD (On-Policy Distillation) training example Entry point, configs, data, and tests for on-policy distillation using DeepSpeed's hybrid engine rollout and vLLM backend. Signed-off-by: Guokai Ma Signed-off-by: Guokai Ma --- training/opsd/README.md | 222 ++++++++++++++++ training/opsd/configs/ds_zero3.json | 43 ++++ training/opsd/configs/opsd_hybrid_engine.json | 48 ++++ training/opsd/configs/opsd_vllm_disjoint.json | 54 ++++ training/opsd/configs/smoke_ds_zero0.json | 20 ++ training/opsd/configs/smoke_ds_zero3.json | 35 +++ training/opsd/configs/smoke_hybrid.json | 48 ++++ training/opsd/configs/smoke_hybrid_gc.json | 49 ++++ training/opsd/configs/smoke_vllm.json | 57 +++++ training/opsd/data/prompts.jsonl | 238 ++++++++++++++++++ training/opsd/main.py | 134 ++++++++++ training/opsd/requirements.txt | 5 + training/opsd/scripts/train_opsd_hybrid.sh | 14 ++ training/opsd/scripts/train_opsd_vllm.sh | 24 ++ training/opsd/tests/test_losses.py | 166 ++++++++++++ training/opsd/tests/test_teacher_caching.py | 101 ++++++++ 16 files changed, 1258 insertions(+) create mode 100644 training/opsd/README.md create mode 100644 training/opsd/configs/ds_zero3.json create mode 100644 training/opsd/configs/opsd_hybrid_engine.json create mode 100644 training/opsd/configs/opsd_vllm_disjoint.json create mode 100644 training/opsd/configs/smoke_ds_zero0.json create mode 100644 training/opsd/configs/smoke_ds_zero3.json create mode 100644 training/opsd/configs/smoke_hybrid.json create mode 100644 training/opsd/configs/smoke_hybrid_gc.json create mode 100644 training/opsd/configs/smoke_vllm.json create mode 100644 training/opsd/data/prompts.jsonl create mode 100644 training/opsd/main.py create mode 100644 training/opsd/requirements.txt create mode 100644 training/opsd/scripts/train_opsd_hybrid.sh create mode 100644 
training/opsd/scripts/train_opsd_vllm.sh create mode 100644 training/opsd/tests/test_losses.py create mode 100644 training/opsd/tests/test_teacher_caching.py diff --git a/training/opsd/README.md b/training/opsd/README.md new file mode 100644 index 000000000..3fce93c36 --- /dev/null +++ b/training/opsd/README.md @@ -0,0 +1,222 @@ +# On-Policy Distillation (OPSD) on DeepSpeed + +A DeepSpeed-native port of [HJSang/OPSD_OnPolicyDistillation](https://github.com/HJSang/OPSD_OnPolicyDistillation), +removing the verl dependency and building directly on DeepSpeed primitives +(ZeRO-3, hybrid engine, `deepspeed.initialize`). + +On-policy distillation trains a small **student** model to imitate a large +frozen **teacher** on the student's *own* generated rollouts. Each training +step has three phases: + +``` +┌────────────┐ prompts ┌──────────────────┐ prompt+response ┌────────────┐ +│ Dataloader │ ──────────▶ │ Student rollout │ ──────────────────▶ │ Teacher │ +└────────────┘ │ (hybrid / vLLM) │ │ forward │ + └──────────────────┘ └─────┬──────┘ + │ logits → CPU cache + ▼ + ┌─────────────────────┐ + │ Student forward + │ + │ streamed KL / JSD + │ + │ backward / step │ + └─────────────────────┘ +``` + +Loss = per-token divergence (`forward_kl` | `reverse_kl` | `jsd`) between +student and teacher distributions on the student's generated tokens, chunked +over the sequence axis so the full `[B, T, V]` teacher tensor never +co-resides with the student logits on the training device. 
+ +## Layout + +``` +training/opsd/ +├── main.py # entry point (deepspeed launcher) +├── opsd/ +│ ├── config.py # OPSDConfig dataclass + JSON loader +│ ├── losses.py # chunked / streamed KL & JSD +│ ├── teacher.py # frozen teacher + CPU logit cache +│ ├── trainer.py # three-phase training loop +│ ├── data.py # JSONL prompt dataset + left-pad collator +│ ├── utils.py # response-mask + shift helpers +│ └── rollout/ +│ ├── base.py # RolloutEngine ABC, request/batch dataclasses +│ ├── hybrid_engine.py # DeepSpeed hybrid-engine rollout +│ └── vllm.py # vLLM rollout on disjoint GPUs +├── configs/ +│ ├── ds_zero3.json # base DeepSpeed ZeRO-3 + hybrid engine +│ ├── opsd_hybrid_engine.json # production-ish hybrid-engine OPSD config +│ ├── opsd_vllm_disjoint.json # vLLM rollout on a disjoint GPU group +│ ├── smoke_hybrid.json # 5-step smoke test with Qwen2.5-0.5B / 1.5B +│ ├── smoke_vllm.json # same but with vLLM rollout +│ └── smoke_ds_zero3.json # ZeRO-3 config tuned for smoke runs +├── scripts/ +│ ├── train_opsd_hybrid.sh # launch hybrid-engine training +│ └── train_opsd_vllm.sh # launch vLLM training +└── tests/ # CPU-only unit tests (run with pytest) +``` + +## Quick start + +### Install + +``` +pip install deepspeed transformers datasets accelerate +# Optional, only for the vLLM rollout backend: +pip install 'vllm>=0.6.4' +``` + +### Hybrid-engine training (single-node, no vLLM) + +``` +cd training/opsd +NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json +``` + +The hybrid engine path lives entirely within DeepSpeed: the student engine +both trains and generates, sharing weights without a copy step. Easiest to +get running; slower generation than vLLM. 
+ +### vLLM training (disjoint GPU group) + +``` +cd training/opsd +# Train on GPUs 0..5, run vLLM on 6,7 (matches default config) +NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \ + bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json +``` + +vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0 +constructs the `LLM` handle; other training ranks receive generated token +ids via NCCL broadcast. + +### Smoke tests (5 steps, small models) + +The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B +(student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated +end-to-end before scaling up. + +``` +cd training/opsd +deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json +# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM): +NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --include localhost:0,1 \ + main.py --config configs/smoke_vllm.json +``` + +## Unit tests + +The CPU-runnable test suite exercises the loss math, teacher caching, rollout +contract, and vLLM stitch logic. Run with: + +``` +cd training/opsd +python -m pytest tests/ -v +``` + +## Configuration + +`OPSDConfig` is a plain dataclass loaded from JSON (no Hydra). The schema: + +```json +{ + "student": { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" }, + "teacher": { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true }, + "rollout": { "engine": "hybrid_engine | vllm", ... }, + "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 }, + "training": { "train_batch_size": 8, "learning_rate": 1e-6, ... }, + "data": { "path": "data/prompts.jsonl", "prompt_field": "prompt" }, + "deepspeed_config": "configs/ds_zero3.json" +} +``` + +See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json` +for fully-populated examples. + +## Adding a new model architecture + +No special steps are needed for new model architectures. 
vLLM's RLHF weight +transfer API handles TP slicing internally; the caller only needs to send full +tensors. + +## Design notes + +* **Why CPU-cache the teacher logits?** Holding both student and teacher + `[B, T, V]` tensors on GPU at once doubles memory pressure. Staging the + teacher to host between the teacher forward and the student backward halves + the worst-case GPU footprint of the loss path. The streamed loss + (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU + one sequence slice at a time so the full tensor never re-materialises. + +* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends + have very different lifecycles (hybrid engine reads student weights live; + vLLM holds its own copy and must be synced) but the trainer should not + care. The ABC keeps the trainer engine-agnostic so additional backends + (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching + the loop. + +* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is + simpler to debug — failures in vLLM don't take down training and vice + versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as + a follow-up. + +* **Weight sync uses vLLM's RLHF API.** vLLM 0.22.0+ exposes + ``/update_weights`` which handles TP slicing internally. The trainer + sends full tensors and vLLM distributes them. + +## vLLM status + +The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but +not yet usable under the DeepSpeed launcher**. During live validation on +4× H200 we hit a blocking issue: + +> vLLM's worker init calls `new_group(...)` on the global process group as +> a collective. Under `deepspeed --num_gpus N`, the world is all `N` +> training ranks but only rank 0 calls into vLLM, so the constructor hangs +> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 + +> torch 2.5.1. Standalone vLLM (world size 1) works in seconds. 
+ +The fix requires running vLLM in a **separate top-level Python process** +with its own world, accessed over HTTP/RPC from the trainer — the pattern +used by TRL and OpenRLHF. That's a larger refactor than fits in this PR; +the current `VLLMRollout` will be the basis for it once landed. + +What's verified for the vLLM path today: +* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test) +* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated) + +What's **not** verified: +* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig` +* `LLM.collective_rpc("load_weights", ...)` weight sync at training time + +The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated +end-to-end on the same hardware. + +## Other known limitations (v1) + +* **vLLM weight sync (when it works) goes through pickle** — + `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`. + Expect several seconds per sync on a 7B model. A faster v2 would broadcast + tensors via NCCL on a shared trainer↔vLLM process group — see verl's + `bucketed_weight_transfer.py` for a reference design. +* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's + slicing math is unit-tested but no live run exists. +* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not + ported. Easy to add: scale `per_tok` by a reward weight in the loss path. +* **GRPO and other on-policy RL recipes** are out of scope. The + `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO + trainer would add its own advantage / KL-to-reference logic on top. +* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed. +* **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no + hybrid-engine inference acceleration), since DeepSpeed's inference policy + list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15. 
+ The fallback gathers params via `GatheredParameters` and calls the HF + model's `generate` directly — correct, just ~3-5x slower than the + accelerated path. + +## References + +* OPSD reference repo: https://github.com/HJSang/OPSD_OnPolicyDistillation +* DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py` +* verl rollout / weight-sync design (used as a cross-check): + https://github.com/volcengine/verl diff --git a/training/opsd/configs/ds_zero3.json b/training/opsd/configs/ds_zero3.json new file mode 100644 index 000000000..1f43339a6 --- /dev/null +++ b/training/opsd/configs/ds_zero3.json @@ -0,0 +1,43 @@ +{ + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": 5e7, + "stage3_prefetch_bucket_size": 5e7, + "stage3_param_persistence_threshold": 1e6, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-6, + "betas": [0.9, 0.95], + "eps": 1e-8, + "weight_decay": 0.0 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 0 + } + }, + "gradient_clipping": 1.0, + "hybrid_engine": { + "enabled": true, + "max_out_tokens": 2048, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + }, + "wall_clock_breakdown": false +} diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json new file mode 100644 index 000000000..d2ebb8b03 --- /dev/null +++ b/training/opsd/configs/opsd_hybrid_engine.json @@ -0,0 +1,48 @@ +{ + "student": { + "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false + }, + "teacher": { + "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct", + "dtype": "bfloat16", + 
"trust_remote_code": false, + "offload_to_cpu": true + }, + "rollout": { + "engine": 
"hybrid_engine", + "max_prompt_length": 1024, + "max_response_length": 1024, + "temperature": 0, + "top_p": 1.0, + "top_k": -1, + "n_samples_per_prompt": 1, + "weight_sync_interval": 1 + }, + "distillation": { + "loss_type": "reverse_kl", + "temperature": 1.0, + "chunk_size": 512 + }, + "training": { + "train_batch_size": 1, + "micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 1e-6, + "weight_decay": 0.0, + "num_train_epochs": 1, + "max_steps": -1, + "warmup_steps": 0, + "save_steps": 500, + "logging_steps": 10, + "save_dir": "./opsd_ckpt_hybrid", + "seed": 42 + }, + "data": { + "path": "data/prompts.jsonl", + "prompt_field": "prompt", + "shuffle": true + }, + "deepspeed_config": "configs/ds_zero3.json" +} diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json new file mode 100644 index 000000000..c98489df6 --- /dev/null +++ b/training/opsd/configs/opsd_vllm_disjoint.json @@ -0,0 +1,54 @@ +{ + "student": { + "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false + }, + "teacher": { + "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false, + "offload_to_cpu": true + }, + "rollout": { + "engine": "vllm", + "max_prompt_length": 1024, + "max_response_length": 1024, + "temperature": 0, + "top_p": 1.0, + "top_k": -1, + "n_samples_per_prompt": 1, + "gpus": [6, 7], + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.85, + "vllm_dtype": "bfloat16", + "weight_sync_interval": 4, + "vllm_min_version": "0.6.4", + "vllm_port": 8000 + }, + "distillation": { + "loss_type": "reverse_kl", + "temperature": 1.0, + "chunk_size": 512 + }, + "training": { + "train_batch_size": 1, + "micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 1e-6, + "weight_decay": 0.0, + "num_train_epochs": 1, + "max_steps": -1, + "warmup_steps": 0, + "save_steps": 500, + "logging_steps": 
10, + "save_dir": "./opsd_ckpt_vllm", + "seed": 42 + }, + "data": { + "path": "data/prompts.jsonl", + "prompt_field": "prompt", + "shuffle": true + }, + "deepspeed_config": "configs/ds_zero3.json" +} diff --git a/training/opsd/configs/smoke_ds_zero0.json b/training/opsd/configs/smoke_ds_zero0.json new file mode 100644 index 000000000..26d9e8495 --- /dev/null +++ b/training/opsd/configs/smoke_ds_zero0.json @@ -0,0 +1,20 @@ +{ + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 0 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-6, + "betas": [0.9, 0.95], + "eps": 1e-8, + "weight_decay": 0.0, + "torch_adam": true + } + }, + "gradient_clipping": 1.0, + "wall_clock_breakdown": false +} diff --git a/training/opsd/configs/smoke_ds_zero3.json b/training/opsd/configs/smoke_ds_zero3.json new file mode 100644 index 000000000..74211f3fb --- /dev/null +++ b/training/opsd/configs/smoke_ds_zero3.json @@ -0,0 +1,35 @@ +{ + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": 5e7, + "stage3_prefetch_bucket_size": 5e7, + "stage3_param_persistence_threshold": 1e6, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-6, + "betas": [0.9, 0.95], + "eps": 1e-8, + "weight_decay": 0.0 + } + }, + "gradient_clipping": 1.0, + "hybrid_engine": { + "enabled": true, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + }, + "wall_clock_breakdown": false +} diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json new file mode 100644 index 000000000..774092926 --- /dev/null +++ b/training/opsd/configs/smoke_hybrid.json @@ -0,0 +1,48 @@ +{ + "student": { + "model_name_or_path": 
"Qwen/Qwen2.5-0.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false + }, + "teacher": { + "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false, + "offload_to_cpu": false + }, + "rollout": { + "engine": "hybrid_engine", + "max_prompt_length": 128, + "max_response_length": 64, + "temperature": 0, + "top_p": 1.0, + "top_k": -1, + "n_samples_per_prompt": 1, + "weight_sync_interval": 1 + }, + "distillation": { + "loss_type": "reverse_kl", + "temperature": 1.0, + "chunk_size": 128 + }, + "training": { + "train_batch_size": 1, + "micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 1e-6, + "weight_decay": 0.0, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 0, + "save_steps": 10000, + "logging_steps": 1, + "save_dir": "./opsd_smoke_hybrid_ckpt", + "seed": 42 + }, + "data": { + "path": "data/prompts.jsonl", + "prompt_field": "prompt", + "shuffle": true + }, + "deepspeed_config": "configs/smoke_ds_zero0.json" +} diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json new file mode 100644 index 000000000..0512c1581 --- /dev/null +++ b/training/opsd/configs/smoke_hybrid_gc.json @@ -0,0 +1,49 @@ +{ + "student": { + "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false + }, + "teacher": { + "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false, + "offload_to_cpu": false + }, + "rollout": { + "engine": "hybrid_engine", + "max_prompt_length": 128, + "max_response_length": 64, + "temperature": 0, + "top_p": 1.0, + "top_k": -1, + "n_samples_per_prompt": 1, + "use_graph_capture": true, + "weight_sync_interval": 1 + }, + "distillation": { + "loss_type": "reverse_kl", + "temperature": 1.0, + "chunk_size": 128 + }, + "training": { + "train_batch_size": 1, + "micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + 
"learning_rate": 1e-6, + "weight_decay": 0.0, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 0, + "save_steps": 10000, + "logging_steps": 1, + "save_dir": "./opsd_smoke_gc_ckpt", + "seed": 42 + }, + "data": { + "path": "data/prompts.jsonl", + "prompt_field": "prompt", + "shuffle": true + }, + "deepspeed_config": "configs/smoke_ds_zero0.json" +} diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json new file mode 100644 index 000000000..fe375e602 --- /dev/null +++ b/training/opsd/configs/smoke_vllm.json @@ -0,0 +1,57 @@ +{ + "student": { + "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false + }, + "teacher": { + "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", + "dtype": "bfloat16", + "trust_remote_code": false, + "offload_to_cpu": false + }, + "rollout": { + "engine": "vllm", + "max_prompt_length": 128, + "max_response_length": 64, + "temperature": 0, + "top_p": 1.0, + "top_k": -1, + "n_samples_per_prompt": 1, + "gpus": [], + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "vllm_dtype": "bfloat16", + "weight_sync_interval": 2, + "vllm_min_version": "0.6.4", + "vllm_enforce_eager": true, + "vllm_port": 8000, + "vllm_python": "/root/miniconda3/envs/vllm/bin/python", + "weight_transfer_backend": "gdr" + }, + "distillation": { + "loss_type": "reverse_kl", + "temperature": 1.0, + "chunk_size": 128 + }, + "training": { + "train_batch_size": 1, + "micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 1e-6, + "weight_decay": 0.0, + "num_train_epochs": 1, + "max_steps": 5, + "warmup_steps": 0, + "save_steps": 10000, + "logging_steps": 1, + "save_dir": "./opsd_smoke_vllm_ckpt", + "seed": 42 + }, + "data": { + "path": "data/prompts.jsonl", + "prompt_field": "prompt", + "shuffle": true + }, + "deepspeed_config": "configs/smoke_ds_zero0.json" +} diff --git a/training/opsd/data/prompts.jsonl b/training/opsd/data/prompts.jsonl new 
file mode 100644 index 000000000..bf0dba878 --- /dev/null +++ b/training/opsd/data/prompts.jsonl @@ -0,0 +1,238 @@ +{"prompt": "Solve: 17 + 25 = ?"} +{"prompt": "What is 12 multiplied by 8?"} +{"prompt": "If a train travels 60 miles per hour for 3 hours, how far does it go?"} +{"prompt": "What is the square root of 144?"} +{"prompt": "Compute 15% of 240."} +{"prompt": "A rectangle has length 7 and width 4. What is its area?"} +{"prompt": "Solve for x: 2x + 5 = 17."} +{"prompt": "What is 7 factorial?"} +{"prompt": "Compute the sum of integers from 1 to 10."} +{"prompt": "What is 2 to the power of 10?"} +{"prompt": "Find the perimeter of a square with side length 9."} +{"prompt": "If 5 apples cost 2.50, what is the cost of 12 apples?"} +{"prompt": "What is the greatest common divisor of 24 and 36?"} +{"prompt": "Convert 0.75 to a fraction in simplest form."} +{"prompt": "If x + y = 10 and x - y = 4, find x and y."} +{"prompt": "What is 1/4 + 1/3?"} +{"prompt": "A circle has radius 5. What is its area?"} +{"prompt": "Compute (3 + 4) * (5 - 2)."} +{"prompt": "What is 81 divided by 9?"} +{"prompt": "If a number doubled is 18, what is the number?"} +{"prompt": "What is 3/5 expressed as a percentage?"} +{"prompt": "Calculate the area of a triangle with base 10 and height 6."} +{"prompt": "What is the least common multiple of 4 and 6?"} +{"prompt": "If a shirt costs 25 after a 20% discount, what was the original price?"} +{"prompt": "Simplify: 2(3x + 4) - x."} +{"prompt": "What is the value of pi rounded to 4 decimal places?"} +{"prompt": "How many sides does a hexagon have?"} +{"prompt": "Compute 2^3 + 3^2."} +{"prompt": "If you roll a standard die, what is the probability of getting a 4?"} +{"prompt": "What is the average of 12, 15, and 18?"} +{"prompt": "Solve: 5x - 3 = 22."} +{"prompt": "What is the volume of a cube with side length 4?"} +{"prompt": "Convert 3 kilometers to meters."} +{"prompt": "What is 13 squared?"} +{"prompt": "If a car uses 8 liters per 100km, how 
much for 350km?"} +{"prompt": "What is the median of 3, 7, 9, 12, 15?"} +{"prompt": "Calculate 25 * 4 + 30 / 6."} +{"prompt": "What is the factorial of 5?"} +{"prompt": "If 3x = 27, what is x?"} +{"prompt": "What is 10% of 0.5?"} +{"prompt": "Simplify the fraction 18/24."} +{"prompt": "What is the next prime number after 7?"} +{"prompt": "How many degrees are in a right angle?"} +{"prompt": "Compute 1/2 * 3/4."} +{"prompt": "What is the surface area of a cube with side 3?"} +{"prompt": "If a population grows by 10% per year from 1000, what is it after 2 years?"} +{"prompt": "What is the absolute value of -15?"} +{"prompt": "Solve: x^2 = 49."} +{"prompt": "How many minutes are in 2.5 hours?"} +{"prompt": "What is 0.1 + 0.02 + 0.003?"} +{"prompt": "A bag has 3 red and 5 blue marbles. What is the probability of picking red?"} +{"prompt": "What is the perimeter of a rectangle with sides 8 and 12?"} +{"prompt": "Compute the cube root of 27."} +{"prompt": "If y = 2x + 1 and x = 5, what is y?"} +{"prompt": "What is the difference between 100 and 37?"} +{"prompt": "How many edges does a rectangular prism have?"} +{"prompt": "Simplify: (x + 2)(x - 2)."} +{"prompt": "What is 4! divided by 2!?"} +{"prompt": "Convert 5/8 to a decimal."} +{"prompt": "What is the hypotenuse of a right triangle with legs 3 and 4?"} +{"prompt": "What is 999 + 1?"} +{"prompt": "If you save 5 per day, how much in 30 days?"} +{"prompt": "What is the reciprocal of 7?"} +{"prompt": "Compute log10(1000)."} +{"prompt": "A pizza is cut into 8 equal slices. 
If you eat 3, what fraction remains?"} +{"prompt": "What is the sum of angles in a triangle?"} +{"prompt": "Round 3.14159 to 2 decimal places."} +{"prompt": "What is 50% of 50% of 200?"} +{"prompt": "If a = 3 and b = 4, what is a^2 + b^2?"} +{"prompt": "How many factors does 12 have?"} +{"prompt": "What is the negative of -7?"} +{"prompt": "Express 0.125 as a fraction."} +{"prompt": "What is the slope of the line y = 3x + 5?"} +{"prompt": "A clock shows 3:15. What is the angle between the hour and minute hands?"} +{"prompt": "What is 11 * 11?"} +{"prompt": "If gas costs 3.50 per gallon and you buy 10 gallons, what is the total?"} +{"prompt": "What are the first 3 multiples of 7?"} +{"prompt": "How many zeros are in one million?"} +{"prompt": "What is 2/3 + 2/3?"} +{"prompt": "Compute the area of a circle with diameter 10."} +{"prompt": "If a book has 300 pages and you read 45 per day, how many days to finish?"} +{"prompt": "What is the value of 5^0?"} +{"prompt": "Solve: 4(x - 1) = 20."} +{"prompt": "What is the complement of a 35 degree angle?"} +{"prompt": "How many distinct permutations of the word MATH?"} +{"prompt": "What is 1/10 as a percentage?"} +{"prompt": "If temperature drops from 15C to -3C, what is the change?"} +{"prompt": "What is the greatest common factor of 18 and 30?"} +{"prompt": "A train is 200m long traveling at 20m/s. How long to pass a pole?"} +{"prompt": "What is the sum of the first 5 odd numbers?"} +{"prompt": "Convert 45 degrees Celsius to Fahrenheit."} +{"prompt": "What is 0.001 * 1000?"} +{"prompt": "How many diagonals does a pentagon have?"} +{"prompt": "Simplify: 6 + 3 * 2."} +{"prompt": "What is 20% of 20% of 500?"} +{"prompt": "If you flip a coin 3 times, how many possible outcomes?"} +{"prompt": "What is the ratio of 15 to 25 in simplest form?"} +{"prompt": "Find x if 3/5 = x/25."} +{"prompt": "What is the mean of 2, 4, 6, 8, 10?"} +{"prompt": "What is 7 * 8 + 6 / 2?"} +{"prompt": "A cylinder has radius 3 and height 10. 
What is its volume?"} +{"prompt": "What is the smallest prime number?"} +{"prompt": "If f(x) = x^2 + 1, what is f(3)?"} +{"prompt": "How many seconds in one hour?"} +{"prompt": "What is the result of 100 mod 7?"} +{"prompt": "Simplify: sqrt(50) / sqrt(2)."} +{"prompt": "What is the distance between points (1,2) and (4,6)?"} +{"prompt": "A recipe needs 2 cups flour for 12 cookies. How many cups for 30 cookies?"} +{"prompt": "What is 1.5 * 2.5?"} +{"prompt": "What is A intersection B if A = {1,2,3} and B = {2,3,4}?"} +{"prompt": "What is the 10th term of the arithmetic sequence 3, 7, 11, 15?"} +{"prompt": "How many cubic centimeters in a cubic meter?"} +{"prompt": "What is the value of 2^10?"} +{"prompt": "Solve the inequality: 2x > 10."} +{"prompt": "What is 3/7 rounded to 2 decimal places?"} +{"prompt": "What is the tangent of 45 degrees?"} +{"prompt": "How many ways to choose 2 items from 5?"} +{"prompt": "What is the product of all integers from 1 to 5?"} +{"prompt": "If 8 workers finish a job in 6 days, how many days for 12 workers?"} +{"prompt": "What is 1000 - 587?"} +{"prompt": "Express 2500 in scientific notation."} +{"prompt": "What is the sum of interior angles of a hexagon?"} +{"prompt": "What is the decimal equivalent of the binary number 1010?"} +{"prompt": "What is the area of a trapezoid with bases 6 and 10 and height 4?"} +{"prompt": "Calculate 15 * 15."} +{"prompt": "What is the supplementary angle of 110 degrees?"} +{"prompt": "A store buys an item for 40 and sells for 60. 
What is the markup percentage?"} +{"prompt": "Solve: |x - 3| = 5."} +{"prompt": "How many days are in a leap year?"} +{"prompt": "What is the compound interest on 1000 at 10% for 2 years?"} +{"prompt": "If the base of a triangle is 8 and height is 5, what is the area?"} +{"prompt": "What is 100 divided by 3 rounded to 2 decimal places?"} +{"prompt": "What is the 7th Fibonacci number?"} +{"prompt": "Convert 1 mile to feet."} +{"prompt": "What is the LCM of 6, 8, and 12?"} +{"prompt": "Simplify: 4(x + 3) - 2(x - 1)."} +{"prompt": "If 3a + 2b = 16 and a = 4, what is b?"} +{"prompt": "What is the sine of 30 degrees?"} +{"prompt": "How many ways can 4 people sit in a row?"} +{"prompt": "What is 0.5^3?"} +{"prompt": "Find the 20th term of 5, 8, 11, 14"} +{"prompt": "A triangle has sides 3, 4, 5. What type of triangle is it?"} +{"prompt": "What is the absolute difference between -5 and 3?"} +{"prompt": "How many grams in 2.5 kilograms?"} +{"prompt": "What is the product of -3 and -7?"} +{"prompt": "If a clock shows 9:00, what is the angle between the hands?"} +{"prompt": "What is the square root of 81?"} +{"prompt": "What is 1/3 + 1/6 + 1/12?"} +{"prompt": "If x^2 - 4 = 0, what are the solutions?"} +{"prompt": "What is the geometric mean of 4 and 16?"} +{"prompt": "Convert 72 km/h to m/s."} +{"prompt": "What is the value of cos(60 degrees)?"} +{"prompt": "A box has 5 red, 3 green, 2 blue balls. What is P(not red)?"} +{"prompt": "What is 2^0 + 2^1 + 2^2 + 2^3?"} +{"prompt": "Find the slope between points (1, 3) and (4, 9)."} +{"prompt": "What is the sum of the first 20 natural numbers?"} +{"prompt": "What is the value of e rounded to 3 decimal places?"} +{"prompt": "How many total degrees in a quadrilateral?"} +{"prompt": "Simplify: (2^3 * 2^4) / 2^5."} +{"prompt": "What is the probability of drawing a king from a standard deck?"} +{"prompt": "A car travels 180 miles in 3 hours. 
What is its average speed?"} +{"prompt": "What is the decimal 0.375 as a fraction?"} +{"prompt": "Solve: 2(x + 5) = 3(x - 1)."} +{"prompt": "How many milliliters in 3 liters?"} +{"prompt": "What is the cube of 5?"} +{"prompt": "What is 5/6 as a repeating decimal?"} +{"prompt": "Find the circumference of a circle with radius 7."} +{"prompt": "If 2 pipes fill a tank in 6 and 12 hours, how long together?"} +{"prompt": "What is the coefficient of x in 3x^2 + 5x - 7?"} +{"prompt": "What is the result of (10^3) / (10^-1)?"} +{"prompt": "Find the GCD of 48 and 72."} +{"prompt": "What is the domain of f(x) = sqrt(x)?"} +{"prompt": "Simplify: 8/12 - 3/12."} +{"prompt": "What is the arithmetic mean of the first 10 even numbers?"} +{"prompt": "Convert -40 Celsius to Fahrenheit."} +{"prompt": "What is the median of 1, 3, 5, 7, 9, 11?"} +{"prompt": "What is the next number: 1, 1, 2, 3, 5, 8, 13?"} +{"prompt": "If 4 workers can paint a fence in 8 hours, how long for 2 workers?"} +{"prompt": "What is the cosine of 0 degrees?"} +{"prompt": "A polygon has 9 sides. What is the sum of its interior angles?"} +{"prompt": "What is 1.2 * 10^3 in standard form?"} +{"prompt": "What is the range of the data set 5, 8, 3, 12, 7?"} +{"prompt": "What is the LCM of 4, 5, and 6?"} +{"prompt": "If y varies directly as x and y = 10 when x = 2, find y when x = 7."} +{"prompt": "What is the degree of the polynomial 3x^4 + 2x^2 - x + 5?"} +{"prompt": "How many diagonals does a hexagon have?"} +{"prompt": "What is 75% expressed as a fraction in lowest terms?"} +{"prompt": "How many ounces in 3 pounds?"} +{"prompt": "What is the volume of a sphere with radius 3?"} +{"prompt": "Solve the system: x + y = 8, x - y = 2."} +{"prompt": "A triangle has two angles of 50 and 70 degrees. 
What is the third angle?"} +{"prompt": "What is the remainder when 100 is divided by 7?"} +{"prompt": "Express 0.04 as a percentage."} +{"prompt": "What is the value of the expression 2 + 3 * 4 - 1?"} +{"prompt": "How many prime numbers are between 10 and 30?"} +{"prompt": "If a laptop costs 800 after 20% off, what was the original price?"} +{"prompt": "What is 5 factorial minus 3 factorial?"} +{"prompt": "Find the length of the diagonal of a rectangle 6 by 8."} +{"prompt": "What is the sine of 90 degrees?"} +{"prompt": "If the ratio of boys to girls is 3:2 and there are 30 students, how many girls?"} +{"prompt": "What is the value of log2(32)?"} +{"prompt": "What is the sum of 1 + 2 + 3 ... + 50?"} +{"prompt": "Convert 40 inches to feet."} +{"prompt": "What is the derivative of x^3?"} +{"prompt": "What is 10^0 + 10^1 + 10^2?"} +{"prompt": "A bag has 4 green, 6 red marbles. What is P(green or red)?"} +{"prompt": "How many multiples of 3 are between 10 and 50?"} +{"prompt": "If 2x + y = 7 and x = 3, what is y?"} +{"prompt": "What is the midpoint of the segment from (2,3) to (8,7)?"} +{"prompt": "Simplify: 2(3x - 1) + 4(x + 2)."} +{"prompt": "How many triangles can be formed from 6 non-collinear points?"} +{"prompt": "What is the 5th root of 32?"} +{"prompt": "What is the mode of 3, 5, 3, 7, 5, 3, 8?"} +{"prompt": "Find the slope of the line passing through (0,0) and (2,6)."} +{"prompt": "What is the supplementary angle of 72 degrees?"} +{"prompt": "How many positive divisors does 36 have?"} +{"prompt": "Simplify: (a + b)^2 - (a - b)^2."} +{"prompt": "How many seconds in 1.5 hours?"} +{"prompt": "If a machine produces 120 items in 8 hours, how many per hour?"} +{"prompt": "What is the inverse of f(x) = 2x + 3?"} +{"prompt": "What is the greatest integer less than sqrt(50)?"} +{"prompt": "What is 4^3 - 3^4?"} +{"prompt": "What is the distance from (0,0) to (3,4)?"} +{"prompt": "If sin(x) = 0.5, what is x in degrees?"} +{"prompt": "A rectangle has area 48 and width 6. 
What is its length?"} +{"prompt": "How many degrees does the minute hand move in 20 minutes?"} +{"prompt": "What is the probability of rolling a sum of 7 with two dice?"} +{"prompt": "Simplify: 3(x + 2) - 2(x - 4)."} +{"prompt": "What is the value of floor(3.7)?"} +{"prompt": "What is the weighted average of 80 (weight 3) and 90 (weight 7)?"} +{"prompt": "Find the y-intercept of y = 3x - 6."} +{"prompt": "How many sides does a decagon have?"} +{"prompt": "What is the integral of 2x dx?"} +{"prompt": "What is 2 + 2 * 2?"} +{"prompt": "If a triangle has sides 5, 5, 5, what is it called?"} +{"prompt": "What is the decimal for 7/8?"} +{"prompt": "If f(x) = 1/x, what is f(5)?"} +{"prompt": "What is the remainder when 2^10 is divided by 7?"} diff --git a/training/opsd/main.py b/training/opsd/main.py new file mode 100644 index 000000000..534c8ae0a --- /dev/null +++ b/training/opsd/main.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""OPSD training entry point. + +Launch with the DeepSpeed launcher:: + + deepspeed --num_gpus 8 main.py --config configs/opsd_hybrid_engine.json + +The DeepSpeed launcher sets ``LOCAL_RANK``, ``RANK``, and ``WORLD_SIZE`` in +the environment; :func:`deepspeed.init_distributed` reads them to set up the process group.
+""" + +import argparse +import json +import os +import random + +import deepspeed +import numpy as np +import torch +from deepspeed.accelerator import get_accelerator +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +from deepspeed.runtime.rlhf.config import OPSDConfig +from deepspeed.runtime.rlhf.data import LeftPaddedPromptCollator, PromptDataset +from deepspeed.runtime.rollout import build_rollout +from deepspeed.runtime.rlhf.teacher import TeacherWrapper +from deepspeed.runtime.rlhf.trainer.opsd import OPSDTrainer + + +def _seed_everything(seed: int) -> None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if get_accelerator().is_available(): + get_accelerator().manual_seed_all(seed) + + +def _resolve_dtype(name: str) -> torch.dtype: + return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name] + + +def _load_ds_config(path: str) -> dict: + with open(path, "r") as f: + return json.load(f) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--config", required=True, help="Path to OPSDConfig JSON") + parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0))) + args = parser.parse_args() + + cfg = OPSDConfig.from_json(args.config) + cfg.validate() + _seed_everything(cfg.training.seed) + + deepspeed.init_distributed() + + # --- tokenizer (shared between data + rollout) ------------------------- + tokenizer = AutoTokenizer.from_pretrained( + cfg.student.model_name_or_path, + trust_remote_code=cfg.student.trust_remote_code, + padding_side="left", + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # --- student model + DeepSpeed engine ---------------------------------- + student_dtype = _resolve_dtype(cfg.student.dtype) + student_model = AutoModelForCausalLM.from_pretrained( + cfg.student.model_name_or_path, + torch_dtype=student_dtype, + 
trust_remote_code=cfg.student.trust_remote_code, + ) + + ds_config = _load_ds_config(cfg.deepspeed_config) + ds_config["train_micro_batch_size_per_gpu"] = cfg.training.micro_batch_size_per_gpu + ds_config["train_batch_size"] = cfg.training.train_batch_size + ds_config["gradient_accumulation_steps"] = cfg.training.gradient_accumulation_steps + + student_engine, *_ = deepspeed.initialize( + model=student_model, + model_parameters=student_model.parameters(), + config=ds_config, + ) + + # --- frozen teacher ---------------------------------------------------- + teacher = TeacherWrapper(cfg.teacher, world_size=dist_world_size()) + + # --- rollout engine ---------------------------------------------------- + rollout = build_rollout( + cfg.rollout, + student_engine=student_engine, + tokenizer=tokenizer, + student_model_path=cfg.student.model_name_or_path, + ) + + # --- dataloader -------------------------------------------------------- + dataset = PromptDataset( + path=cfg.data.path, + tokenizer=tokenizer, + max_prompt_length=cfg.rollout.max_prompt_length, + prompt_field=cfg.data.prompt_field, + chat_template=cfg.data.chat_template, + ) + collator = LeftPaddedPromptCollator(tokenizer=tokenizer, max_prompt_length=cfg.rollout.max_prompt_length) + loader = DataLoader( + dataset, + batch_size=cfg.training.micro_batch_size_per_gpu, + shuffle=cfg.data.shuffle, + collate_fn=collator, + drop_last=True, + ) + + OPSDTrainer( + cfg=cfg, + student_engine=student_engine, + teacher=teacher, + tokenizer=tokenizer, + rollout=rollout, + dataloader=loader, + ).train() + + +def dist_world_size() -> int: + return int(os.environ.get("WORLD_SIZE", "1")) + + +if __name__ == "__main__": + main() diff --git a/training/opsd/requirements.txt b/training/opsd/requirements.txt new file mode 100644 index 000000000..fb5a09157 --- /dev/null +++ b/training/opsd/requirements.txt @@ -0,0 +1,5 @@ +datasets>=2.0.0 +numpy +transformers>=4.40.0 +# Optional, only needed when rollout.engine == "vllm": +# 
vllm>=0.6.4 diff --git a/training/opsd/scripts/train_opsd_hybrid.sh b/training/opsd/scripts/train_opsd_hybrid.sh new file mode 100644 index 000000000..69e3bdc68 --- /dev/null +++ b/training/opsd/scripts/train_opsd_hybrid.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +# +# Launch OPSD training with the DeepSpeed hybrid-engine rollout (no vLLM). +# Assumes you're cd'd into training/opsd/. +set -euo pipefail + +CONFIG="${1:-configs/opsd_hybrid_engine.json}" +NUM_GPUS="${NUM_GPUS:-8}" + +deepspeed --num_gpus "${NUM_GPUS}" main.py --config "${CONFIG}" diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh new file mode 100644 index 000000000..6ad847954 --- /dev/null +++ b/training/opsd/scripts/train_opsd_vllm.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +# +# Launch OPSD training with vLLM rollout. +# +# The vLLM server is started **lazily** as a subprocess by training rank 0 +# on first use, so no separate vLLM launch step is required. The GPUs +# listed in ``rollout.gpus`` in the config are assigned to the vLLM server +# via ``CUDA_VISIBLE_DEVICES`` in the subprocess environment. +# +# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7 +# run vLLM with TP=2. Adjust configs/opsd_vllm_disjoint.json::rollout.gpus +# and NUM_TRAIN_GPUS to match your topology.
+set -euo pipefail + +CONFIG="${1:-configs/opsd_vllm_disjoint.json}" +NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}" +INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}" + +deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \ + main.py --config "${CONFIG}" diff --git a/training/opsd/tests/test_losses.py b/training/opsd/tests/test_losses.py new file mode 100644 index 000000000..41ea92289 --- /dev/null +++ b/training/opsd/tests/test_losses.py @@ -0,0 +1,166 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""CPU-only numerics tests for the distillation divergences. + +These exercise the loss math without needing GPUs, models, or a torchrun +launcher. Run from the example root with:: + + cd training/opsd && python -m pytest tests/test_losses.py -v +""" + +import pytest +import torch + +from deepspeed.runtime.rlhf.losses import chunked_distillation_loss, per_token_logprobs +from deepspeed.runtime.rlhf.utils import build_response_mask, shift_for_next_token_prediction + + +@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"]) +def test_zero_when_identical(loss_type): + torch.manual_seed(0) + logits = torch.randn(2, 8, 32) + mask = torch.ones(2, 8) + loss = chunked_distillation_loss(logits, logits.clone(), mask, loss_type=loss_type) + assert loss.item() == pytest.approx(0.0, abs=1e-5) + + +@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"]) +def test_positive_when_different(loss_type): + torch.manual_seed(0) + s = torch.randn(2, 8, 32) + t = torch.randn(2, 8, 32) + mask = torch.ones(2, 8) + loss = chunked_distillation_loss(s, t, mask, loss_type=loss_type) + assert loss.item() > 0.0 + + +@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"]) +def test_chunking_equivalent_to_unchunked(loss_type): + torch.manual_seed(0) + s = torch.randn(2, 100, 32) + t = torch.randn(2, 100, 32) + mask = torch.ones(2, 100) + loss_chunked = 
chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10) + loss_whole = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10_000) + assert torch.allclose(loss_chunked, loss_whole, atol=1e-5) + + +def test_mask_excludes_tokens(): + torch.manual_seed(0) + s = torch.randn(2, 8, 32) + t = torch.randn(2, 8, 32) + half_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float32) + loss_direct = chunked_distillation_loss(s[:, :4], t[:, :4], torch.ones(2, 4), loss_type="reverse_kl") + loss_masked = chunked_distillation_loss(s, t, half_mask, loss_type="reverse_kl") + assert torch.allclose(loss_direct, loss_masked, atol=1e-5) + + +def test_gradient_flows_to_student(): + torch.manual_seed(0) + s = torch.randn(2, 8, 32, requires_grad=True) + t = torch.randn(2, 8, 32) + mask = torch.ones(2, 8) + loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl") + loss.backward() + assert s.grad is not None + assert s.grad.abs().sum().item() > 0 + + +def test_gradient_does_not_flow_to_teacher_when_detached(): + torch.manual_seed(0) + s = torch.randn(2, 8, 32, requires_grad=True) + t = torch.randn(2, 8, 32, requires_grad=True) + mask = torch.ones(2, 8) + loss = chunked_distillation_loss(s, t.detach(), mask, loss_type="reverse_kl") + loss.backward() + assert t.grad is None + + +def test_unknown_loss_type_raises(): + s = torch.randn(2, 4, 8) + t = torch.randn(2, 4, 8) + mask = torch.ones(2, 4) + with pytest.raises(ValueError, match="Unknown loss_type"): + chunked_distillation_loss(s, t, mask, loss_type="totally_made_up") + + +def test_shape_mismatch_raises(): + s = torch.randn(2, 4, 8) + t = torch.randn(2, 5, 8) + mask = torch.ones(2, 4) + with pytest.raises(ValueError, match="shape mismatch"): + chunked_distillation_loss(s, t, mask) + + +def test_mask_shape_mismatch_raises(): + s = torch.randn(2, 4, 8) + t = torch.randn(2, 4, 8) + mask = torch.ones(2, 5) + with pytest.raises(ValueError, match="does not 
match"): + chunked_distillation_loss(s, t, mask) + + +@pytest.mark.parametrize("temperature", [0.5, 1.0, 2.0]) +def test_temperature_changes_loss_but_stays_finite(temperature): + torch.manual_seed(0) + s = torch.randn(2, 8, 32) + t = torch.randn(2, 8, 32) + mask = torch.ones(2, 8) + loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", temperature=temperature) + assert torch.isfinite(loss).item() + + +def test_jsd_is_symmetric(): + torch.manual_seed(0) + a = torch.randn(2, 8, 32) + b = torch.randn(2, 8, 32) + mask = torch.ones(2, 8) + jsd_ab = chunked_distillation_loss(a, b, mask, loss_type="jsd") + jsd_ba = chunked_distillation_loss(b, a, mask, loss_type="jsd") + assert torch.allclose(jsd_ab, jsd_ba, atol=1e-5) + + +def test_all_zero_mask_returns_zero(): + torch.manual_seed(0) + s = torch.randn(2, 8, 32) + t = torch.randn(2, 8, 32) + mask = torch.zeros(2, 8) + loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl") + assert loss.item() == pytest.approx(0.0, abs=1e-6) + + +def test_per_token_logprobs_matches_manual(): + torch.manual_seed(0) + logits = torch.randn(2, 4, 16) + labels = torch.randint(0, 16, (2, 4)) + got = per_token_logprobs(logits, labels) + expected = torch.log_softmax(logits.float(), dim=-1) + expected = expected.gather(-1, labels.unsqueeze(-1)).squeeze(-1) + assert torch.allclose(got, expected, atol=1e-6) + + +def test_build_response_mask_basic(): + attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]]) + response_start_idx = torch.tensor([2, 3]) + resp = build_response_mask(response_start_idx, attention_mask) + expected = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]]) + assert torch.equal(resp, expected) + + +def test_build_response_mask_validates_shapes(): + with pytest.raises(ValueError, match="response_start_idx must be 1-D"): + build_response_mask(torch.zeros(2, 2), torch.ones(2, 4)) + with pytest.raises(ValueError, match="attention_mask must be 2-D"): + build_response_mask(torch.zeros(2), 
torch.ones(4)) + with pytest.raises(ValueError, match="batch"): + build_response_mask(torch.zeros(3), torch.ones(2, 4)) + + +def test_shift_for_next_token_prediction_shapes(): + logits = torch.randn(2, 5, 8) + labels = torch.randint(0, 8, (2, 5)) + sl, sla = shift_for_next_token_prediction(logits, labels) + assert sl.shape == (2, 4, 8) + assert sla.shape == (2, 4) diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py new file mode 100644 index 000000000..36d2fcea8 --- /dev/null +++ b/training/opsd/tests/test_teacher_caching.py @@ -0,0 +1,101 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""CPU-only tests for TeacherLogitCache. + +The ``TeacherWrapper`` itself (which wraps deepspeed+transformers) is not +exercised here because it requires a real model and a DeepSpeed launcher; the +caching/streaming pieces are isolated into ``TeacherLogitCache`` so they can +be tested in isolation. +""" + +import pytest +import torch + +from deepspeed.runtime.rlhf.teacher import TeacherLogitCache + + +def test_round_trip_preserves_values_within_dtype(): + torch.manual_seed(0) + gpu_like = torch.randn(2, 16, 32, dtype=torch.float32) + cache = TeacherLogitCache.from_gpu_logits(gpu_like, store_dtype=torch.bfloat16) + assert cache.shape == (2, 16, 32) + assert cache.dtype == torch.bfloat16 + chunk = cache.chunk_to_device(0, 16, torch.device("cpu"), dtype=torch.float32) + # bf16 round-trip loses precision; check it stays within bf16's worst-case + # relative error rather than asserting exact equality. 
+ assert torch.allclose(chunk, gpu_like, atol=1e-1, rtol=1e-1) + + +def test_chunk_slicing_is_correct(): + torch.manual_seed(0) + src = torch.randn(3, 100, 8) + cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32) + for start, end in [(0, 10), (10, 50), (50, 100), (33, 77)]: + got = cache.chunk_to_device(start, end, torch.device("cpu")) + assert got.shape == (3, end - start, 8) + assert torch.allclose(got, src[:, start:end]) + + +def test_invalid_chunk_bounds_raise(): + cache = TeacherLogitCache.from_gpu_logits(torch.zeros(1, 8, 4), store_dtype=torch.float32) + with pytest.raises(ValueError, match="invalid"): + cache.chunk_to_device(0, 9, torch.device("cpu")) + with pytest.raises(ValueError, match="invalid"): + cache.chunk_to_device(5, 3, torch.device("cpu")) + with pytest.raises(ValueError, match="invalid"): + cache.chunk_to_device(-1, 4, torch.device("cpu")) + + +def test_rejects_non_3d_logits(): + with pytest.raises(ValueError, match="must be 3-D"): + TeacherLogitCache(cpu_logits=torch.zeros(8, 32)) + + +def test_rejects_gpu_resident_logits(): + if not torch.cuda.is_available(): #ignore-cuda + pytest.skip("no CUDA available to construct GPU tensor") + with pytest.raises(ValueError, match="must live on CPU"): + TeacherLogitCache(cpu_logits=torch.zeros(1, 8, 4, device="cuda")) + + +def test_dtype_override_in_chunk_to_device(): + src = torch.randn(2, 8, 16, dtype=torch.float32) + cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32) + chunk = cache.chunk_to_device(0, 8, torch.device("cpu"), dtype=torch.bfloat16) + assert chunk.dtype == torch.bfloat16 + + +def test_free_releases_buffer(): + src = torch.randn(2, 32, 16) + cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32) + assert cache.cpu_logits.numel() == 2 * 32 * 16 + cache.free() + assert cache.cpu_logits.numel() == 0 + + +def test_default_store_dtype_is_bf16(): + src = torch.randn(1, 4, 8) + cache = TeacherLogitCache.from_gpu_logits(src) + assert 
cache.dtype == torch.bfloat16 + + +def test_streamed_chunked_loss_matches_full_loss(): + """End-to-end check: pulling teacher logits chunk-by-chunk through the + cache yields the same distillation loss as passing the full teacher tensor + to ``chunked_distillation_loss`` directly.""" + from deepspeed.runtime.rlhf.losses import chunked_distillation_loss + + torch.manual_seed(0) + s = torch.randn(2, 64, 32) + t = torch.randn(2, 64, 32) + mask = torch.ones(2, 64) + + direct = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", chunk_size=8) + + cache = TeacherLogitCache.from_gpu_logits(t, store_dtype=torch.float32) + staged_full = cache.chunk_to_device(0, 64, torch.device("cpu"), dtype=torch.float32) + via_cache = chunked_distillation_loss(s, staged_full, mask, loss_type="reverse_kl", chunk_size=8) + + assert torch.allclose(direct, via_cache, atol=1e-6) From 6b8f5843cc6220d2f41c11c206ae7d85c33fdb79 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Wed, 1 Jul 2026 18:47:14 +0800 Subject: [PATCH 2/8] Use ROLLOUT_VISIBLE_DEVICE env var for vLLM GPU placement; rename vllm_dtype to engine_dtype Signed-off-by: Guokai Ma --- training/opsd/README.md | 18 ++++++++++++++---- training/opsd/configs/opsd_vllm_disjoint.json | 3 +-- training/opsd/configs/smoke_vllm.json | 3 +-- training/opsd/scripts/train_opsd_vllm.sh | 13 ++++++++----- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/training/opsd/README.md b/training/opsd/README.md index 3fce93c36..f0fcc6d99 100644 --- a/training/opsd/README.md +++ b/training/opsd/README.md @@ -86,9 +86,11 @@ NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \ bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json ``` -vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0 -constructs the `LLM` handle; other training ranks receive generated token -ids via NCCL broadcast. +vLLM gets dedicated GPUs via the `ROLLOUT_VISIBLE_DEVICE` environment +variable (comma-separated CUDA device indices, e.g. 
+`ROLLOUT_VISIBLE_DEVICE=6,7`). Training rank 0 spawns the vLLM server as +a subprocess with `CUDA_VISIBLE_DEVICES` set to those devices; other +training ranks receive generated token ids via NCCL broadcast. ### Smoke tests (5 steps, small models) @@ -100,7 +102,8 @@ end-to-end before scaling up. cd examples/opsd deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json # For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM): -NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --num_gpus 2 --include localhost:0,1 \ +NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 ROLLOUT_VISIBLE_DEVICE=2,3 \ + deepspeed --num_gpus 2 --include localhost:0,1 \ main.py --config configs/smoke_vllm.json ``` @@ -133,6 +136,13 @@ python -m pytest tests/ -v See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json` for fully-populated examples. +**GPU placement for vLLM rollout:** The GPUs available to the vLLM server +are controlled by the `ROLLOUT_VISIBLE_DEVICE` environment variable +(comma-separated CUDA device indices, e.g. `ROLLOUT_VISIBLE_DEVICE=6,7`), +not by a field in the JSON config. This keeps the vLLM device assignment +decoupled from the DeepSpeed launcher's own `CUDA_VISIBLE_DEVICES` / +`--include` flags, which control only the training ranks. + ## Adding a new model architecture No special steps are needed for new model architectures. 
vLLM's RLHF weight diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json index c98489df6..63c1ecc03 100644 --- a/training/opsd/configs/opsd_vllm_disjoint.json +++ b/training/opsd/configs/opsd_vllm_disjoint.json @@ -18,10 +18,9 @@ "top_p": 1.0, "top_k": -1, "n_samples_per_prompt": 1, - "gpus": [6, 7], "tensor_parallel_size": 2, "gpu_memory_utilization": 0.85, - "vllm_dtype": "bfloat16", + "engine_dtype": "bfloat16", "weight_sync_interval": 4, "vllm_min_version": "0.6.4", "vllm_port": 8000 diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json index fe375e602..bda234ca0 100644 --- a/training/opsd/configs/smoke_vllm.json +++ b/training/opsd/configs/smoke_vllm.json @@ -18,10 +18,9 @@ "top_p": 1.0, "top_k": -1, "n_samples_per_prompt": 1, - "gpus": [], "tensor_parallel_size": 1, "gpu_memory_utilization": 0.3, - "vllm_dtype": "bfloat16", + "engine_dtype": "bfloat16", "weight_sync_interval": 2, "vllm_min_version": "0.6.4", "vllm_enforce_eager": true, diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh index 6ad847954..f39d659ec 100644 --- a/training/opsd/scripts/train_opsd_vllm.sh +++ b/training/opsd/scripts/train_opsd_vllm.sh @@ -8,17 +8,20 @@ # # The vLLM server is started **lazily** as a subprocess by training rank 0 # on first use, so no separate vLLM launch step is required. The GPUs -# listed in ``rollout.gpus`` in the config are assigned to the vLLM server -# via ``CUDA_VISIBLE_DEVICES`` in the subprocess environment. +# assigned to the vLLM server are controlled by the ROLLOUT_VISIBLE_DEVICE +# environment variable (comma-separated CUDA device indices). The training +# ranks must run on a *different* set of GPUs so the two don't contend for +# memory. # -# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7 -# run vLLM with TP=2. 
Adjust configs/opsd_vllm_disjoint.json::rollout.gpus -# and NUM_TRAIN_GPUS to match your topology. +# Default topology: ranks 0..5 train on GPUs 0-5 (ZeRO-3), devices 6-7 +# run vLLM with TP=2. Override via: +# ROLLOUT_VISIBLE_DEVICE=... NUM_TRAIN_GPUS=.. INCLUDE_GPUS=.. bash ... set -euo pipefail CONFIG="${1:-configs/opsd_vllm_disjoint.json}" NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}" INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}" +export ROLLOUT_VISIBLE_DEVICE="${ROLLOUT_VISIBLE_DEVICE:-6,7}" deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \ main.py --config "${CONFIG}" From 7580c2805bc62a7f201bc8e9cb7c5ca820703787 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 17:51:45 +0800 Subject: [PATCH 3/8] Remove vLLM path, absorb trainer/config/losses/utils/benchmarks from DeepSpeed - Delete vLLM configs, scripts (opsd_vllm_disjoint.json, smoke_vllm.json, train_opsd_vllm.sh) - Add trainer.py, config.py, losses.py, utils.py (moved from DeepSpeed) - Add benchmarks/ (5 hybrid engine benchmarks moved from DeepSpeed) - Update main.py imports (trainer, config now local) - Update test imports (losses, utils now local) - Rewrite README (remove all vLLM sections) Signed-off-by: Guokai Ma --- training/opsd/README.md | 153 +++---------- training/opsd/benchmarks/bench_14b_rollout.py | 134 +++++++++++ training/opsd/benchmarks/bench_autotp_gc.py | 96 ++++++++ training/opsd/benchmarks/bench_decode_1p1r.py | 180 +++++++++++++++ training/opsd/benchmarks/bench_hybrid_tp.py | 145 ++++++++++++ .../opsd/benchmarks/bench_hybrid_tp_opt.py | 149 +++++++++++++ training/opsd/config.py | 104 +++++++++ training/opsd/configs/opsd_vllm_disjoint.json | 53 ----- training/opsd/configs/smoke_vllm.json | 56 ----- training/opsd/losses.py | 192 ++++++++++++++++ training/opsd/main.py | 4 +- training/opsd/scripts/train_opsd_vllm.sh | 27 --- training/opsd/tests/test_losses.py | 4 +- training/opsd/tests/test_teacher_caching.py | 2 +- training/opsd/trainer.py | 210 
++++++++++++++++++ training/opsd/utils.py | 52 +++++ 16 files changed, 1295 insertions(+), 266 deletions(-) create mode 100644 training/opsd/benchmarks/bench_14b_rollout.py create mode 100644 training/opsd/benchmarks/bench_autotp_gc.py create mode 100644 training/opsd/benchmarks/bench_decode_1p1r.py create mode 100644 training/opsd/benchmarks/bench_hybrid_tp.py create mode 100644 training/opsd/benchmarks/bench_hybrid_tp_opt.py create mode 100644 training/opsd/config.py delete mode 100644 training/opsd/configs/opsd_vllm_disjoint.json delete mode 100644 training/opsd/configs/smoke_vllm.json create mode 100644 training/opsd/losses.py delete mode 100644 training/opsd/scripts/train_opsd_vllm.sh create mode 100644 training/opsd/trainer.py create mode 100644 training/opsd/utils.py diff --git a/training/opsd/README.md b/training/opsd/README.md index f0fcc6d99..392e2c3ce 100644 --- a/training/opsd/README.md +++ b/training/opsd/README.md @@ -11,7 +11,7 @@ step has three phases: ``` ┌────────────┐ prompts ┌──────────────────┐ prompt+response ┌────────────┐ │ Dataloader │ ──────────▶ │ Student rollout │ ──────────────────▶ │ Teacher │ -└────────────┘ │ (hybrid / vLLM) │ │ forward │ +└────────────┘ │ (hybrid engine) │ │ forward │ └──────────────────┘ └─────┬──────┘ │ logits → CPU cache ▼ @@ -30,29 +30,17 @@ co-resides with the student logits on the training device. 
## Layout ``` -examples/opsd/ +training/opsd/ ├── main.py # entry point (deepspeed launcher) -├── opsd/ -│ ├── config.py # OPSDConfig dataclass + JSON loader -│ ├── losses.py # chunked / streamed KL & JSD -│ ├── teacher.py # frozen teacher + CPU logit cache -│ ├── trainer.py # three-phase training loop -│ ├── data.py # JSONL prompt dataset + left-pad collator -│ ├── utils.py # response-mask + shift helpers -│ └── rollout/ -│ ├── base.py # RolloutEngine ABC, request/batch dataclasses -│ ├── hybrid_engine.py # DeepSpeed hybrid-engine rollout -│ └── vllm.py # vLLM rollout on disjoint GPUs +├── trainer.py # three-phase OPSD training loop ├── configs/ │ ├── ds_zero3.json # base DeepSpeed ZeRO-3 + hybrid engine │ ├── opsd_hybrid_engine.json # production-ish hybrid-engine OPSD config -│ ├── opsd_vllm_disjoint.json # vLLM rollout on a disjoint GPU group │ ├── smoke_hybrid.json # 5-step smoke test with Qwen2.5-0.5B / 1.5B -│ ├── smoke_vllm.json # same but with vLLM rollout │ └── smoke_ds_zero3.json # ZeRO-3 config tuned for smoke runs +├── benchmarks/ # rollout / decode micro-benchmarks ├── scripts/ -│ ├── train_opsd_hybrid.sh # launch hybrid-engine training -│ └── train_opsd_vllm.sh # launch vLLM training +│ └── train_opsd_hybrid.sh # launch hybrid-engine training └── tests/ # CPU-only unit tests (run with pytest) ``` @@ -62,58 +50,35 @@ examples/opsd/ ``` pip install deepspeed transformers datasets accelerate -# Optional, only for the vLLM rollout backend: -pip install 'vllm>=0.6.4' ``` -### Hybrid-engine training (single-node, no vLLM) +### Hybrid-engine training ``` -cd examples/opsd +cd training/opsd NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json ``` The hybrid engine path lives entirely within DeepSpeed: the student engine -both trains and generates, sharing weights without a copy step. Easiest to -get running; slower generation than vLLM. 
- -### vLLM training (disjoint GPU group) - -``` -cd examples/opsd -# Train on GPUs 0..5, run vLLM on 6,7 (matches default config) -NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \ - bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json -``` - -vLLM gets dedicated GPUs via the `ROLLOUT_VISIBLE_DEVICE` environment -variable (comma-separated CUDA device indices, e.g. -`ROLLOUT_VISIBLE_DEVICE=6,7`). Training rank 0 spawns the vLLM server as -a subprocess with `CUDA_VISIBLE_DEVICES` set to those devices; other -training ranks receive generated token ids via NCCL broadcast. +both trains and generates, sharing weights without a copy step. ### Smoke tests (5 steps, small models) -The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B +The `smoke_hybrid.json` config runs on 2 GPUs in a few minutes with Qwen2.5-0.5B (student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated end-to-end before scaling up. ``` -cd examples/opsd +cd training/opsd deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json -# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM): -NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 ROLLOUT_VISIBLE_DEVICE=2,3 \ - deepspeed --num_gpus 2 --include localhost:0,1 \ - main.py --config configs/smoke_vllm.json ``` ## Unit tests -The CPU-runnable test suite exercises the loss math, teacher caching, rollout -contract, and vLLM stitch logic. Run with: +The CPU-runnable test suite exercises the loss math and teacher caching. Run with: ``` -cd examples/opsd +cd training/opsd python -m pytest tests/ -v ``` @@ -123,9 +88,9 @@ python -m pytest tests/ -v ```json { - "student": { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" }, + "student": { "model_name_or_path": "...", "dtype": "bfloat16" }, "teacher": { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true }, - "rollout": { "engine": "hybrid_engine | vllm", ... }, + "rollout": { "engine": "hybrid_engine", ... 
}, "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 }, "training": { "train_batch_size": 8, "learning_rate": 1e-6, ... }, "data": { "path": "data/prompts.jsonl", "prompt_field": "prompt" }, @@ -133,21 +98,7 @@ python -m pytest tests/ -v } ``` -See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json` -for fully-populated examples. - -**GPU placement for vLLM rollout:** The GPUs available to the vLLM server -are controlled by the `ROLLOUT_VISIBLE_DEVICE` environment variable -(comma-separated CUDA device indices, e.g. `ROLLOUT_VISIBLE_DEVICE=6,7`), -not by a field in the JSON config. This keeps the vLLM device assignment -decoupled from the DeepSpeed launcher's own `CUDA_VISIBLE_DEVICES` / -`--include` flags, which control only the training ranks. - -## Adding a new model architecture - -No special steps are needed for new model architectures. vLLM's RLHF weight -transfer API handles TP slicing internally; the caller only needs to send full -tensors. +See `configs/opsd_hybrid_engine.json` for a fully-populated example. ## Design notes @@ -158,65 +109,11 @@ tensors. (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU one sequence slice at a time so the full tensor never re-materialises. -* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends - have very different lifecycles (hybrid engine reads student weights live; - vLLM holds its own copy and must be synced) but the trainer should not - care. The ABC keeps the trainer engine-agnostic so additional backends - (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching - the loop. - -* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is - simpler to debug — failures in vLLM don't take down training and vice - versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as - a follow-up. 
- -* **Weight sync uses vLLM's RLHF API.** vLLM 0.22.0+ exposes - ``/update_weights`` which handles TP slicing internally. The trainer - sends full tensors and vLLM distributes them. - -## vLLM status - -The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but -not yet usable under the DeepSpeed launcher**. During live validation on -4× H200 we hit a blocking issue: - -> vLLM's worker init calls `new_group(...)` on the global process group as -> a collective. Under `deepspeed --num_gpus N`, the world is all `N` -> training ranks but only rank 0 calls into vLLM, so the constructor hangs -> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 + -> torch 2.5.1. Standalone vLLM (world size 1) works in seconds. - -The fix requires running vLLM in a **separate top-level Python process** -with its own world, accessed over HTTP/RPC from the trainer — the pattern -used by TRL and OpenRLHF. That's a larger refactor than fits in this PR; -the current `VLLMRollout` will be the basis for it once landed. - -What's verified for the vLLM path today: -* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test) -* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated) - -What's **not** verified: -* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig` -* `LLM.collective_rpc("load_weights", ...)` weight sync at training time - -The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated -end-to-end on the same hardware. - -## Other known limitations (v1) - -* **vLLM weight sync (when it works) goes through pickle** — - `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`. - Expect several seconds per sync on a 7B model. A faster v2 would broadcast - tensors via NCCL on a shared trainer↔vLLM process group — see verl's - `bucketed_weight_transfer.py` for a reference design. 
-* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's - slicing math is unit-tested but no live run exists. -* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not - ported. Easy to add: scale `per_tok` by a reward weight in the loss path. -* **GRPO and other on-policy RL recipes** are out of scope. The - `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO - trainer would add its own advantage / KL-to-reference logic on top. -* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed. +* **Why an abstract `RolloutEngine`?** The ABC keeps the trainer + engine-agnostic so additional backends can be added without touching the + training loop. DeepSpeed provides the `HybridEngineRollout` implementation; + external frameworks may plug in their own. + * **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no hybrid-engine inference acceleration), since DeepSpeed's inference policy list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15. @@ -224,9 +121,15 @@ end-to-end on the same hardware. model's `generate` directly — correct, just ~3-5x slower than the accelerated path. +## Other known limitations + +* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not + ported. Easy to add: scale `per_tok` by a reward weight in the loss path. +* **GRPO and other on-policy RL recipes** are out of scope. The + `RolloutEngine` abstraction is reusable, but a GRPO trainer would add its + own advantage / KL-to-reference logic on top. 
+ ## References * OPSD reference repo: * DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py` -* verl rollout / weight-sync design (used as a cross-check): - diff --git a/training/opsd/benchmarks/bench_14b_rollout.py b/training/opsd/benchmarks/bench_14b_rollout.py new file mode 100644 index 000000000..d66c7615d --- /dev/null +++ b/training/opsd/benchmarks/bench_14b_rollout.py @@ -0,0 +1,134 @@ +"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC.""" +import time +import os +import sys +import torch +import deepspeed +from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig +from transformers import AutoModelForCausalLM, AutoTokenizer + +MODEL = "Qwen/Qwen2.5-14B-Instruct" +MAX_NEW_TOKENS = 256 +N_SAMPLES = 1 +CB_SIZE = 1 +N_RUNS = 5 +PROMPT = "def fibonacci(n):" + + +def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label): + rank = torch.distributed.get_rank() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + device = torch.device(f"cuda:{local_rank}") + + rollout = HybridEngineRollout( + engine=engine, + tokenizer=tokenizer, + continuous_batching_size=cb_size, + use_graph_capture=use_graph_capture, + ) + + ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device) + req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids)) + sampling = SamplingConfig( + max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95, + n_samples_per_prompt=N_SAMPLES + ) + + # Warmup + torch.manual_seed(42) + engine.eval() + rollout.generate(req, sampling) + engine.train() + + # Benchmark + times = [] + total_toks = 0 + for i in range(N_RUNS): + torch.manual_seed(42 + i) + engine.eval() + torch.cuda.synchronize() + t0 = time.time() + batch = rollout.generate(req, sampling) + torch.cuda.synchronize() + times.append(time.time() - t0) + engine.train() + + # Count tokens from last run + pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id + for i in 
range(batch.input_ids.shape[0]): + resp = batch.input_ids[i, batch.response_start_idx[i]:] + total_toks += (resp != pad_id).sum().item() + + t_avg = sum(times[1:]) / len(times[1:]) + + if rank == 0: + print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s " + f"runs={[f'{t*1000:.0f}' for t in times]}") + + return total_toks, t_avg + + +def main(): + deepspeed.init_distributed() + rank = torch.distributed.get_rank() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(local_rank) + + world_size = torch.distributed.get_world_size() + tp_size = world_size # all GPUs used for TP + + tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True) + + ds_config = { + "bf16": {"enabled": True}, + "zero_optimization": {"stage": 0}, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": world_size, + "gradient_accumulation_steps": 1, + "hybrid_engine": { + "enabled": True, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": False, + "pin_parameters": True, + "tp_gather_partition_size": 8, + }, + } + + if tp_size > 1: + ds_config["tensor_parallel"] = { + "autotp_size": tp_size, + "preset_model": "qwen2", + "tp": {"tp_size": tp_size}, + } + + engine, *_ = deepspeed.initialize(model=model, config=ds_config) + + if rank == 0: + print(f"\n{'='*60}") + print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}") + print(f"{'='*60}") + + # 1P1R without graph capture (CB=1, no GC) + try: + bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}") + except Exception as e: + if rank == 0: + print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}") + import traceback; traceback.print_exc() + + # 1P1R with CUDA graph capture + try: + bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, 
label=f"TP{tp_size} CB={CB_SIZE}+GC") + except Exception as e: + if rank == 0: + print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}") + import traceback; traceback.print_exc() + + if rank == 0: + print(f"{'='*60}\n") + + +if __name__ == "__main__": + main() diff --git a/training/opsd/benchmarks/bench_autotp_gc.py b/training/opsd/benchmarks/bench_autotp_gc.py new file mode 100644 index 000000000..c9a245b24 --- /dev/null +++ b/training/opsd/benchmarks/bench_autotp_gc.py @@ -0,0 +1,96 @@ +"""Benchmark rollout with AutoTP + graph capture on 14B model.""" +import time +import torch +import deepspeed +from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig +from transformers import AutoModelForCausalLM, AutoTokenizer + +def main(): + deepspeed.init_distributed() + rank = torch.distributed.get_rank() + local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + device = torch.device(f"cuda:{local_rank}") + + model_name = "Qwen/Qwen2.5-14B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + model = AutoModelForCausalLM.from_pretrained( + model_name, dtype=torch.bfloat16, trust_remote_code=True + ) + + ds_config = { + "bf16": {"enabled": True}, + "zero_optimization": {"stage": 0}, + "tensor_parallel": { + "autotp_size": 2, + "preset_model": "qwen2", + "tp": {"tp_size": 2}, + }, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 2, + "gradient_accumulation_steps": 1, + "hybrid_engine": { + "enabled": True, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": False, + "pin_parameters": True, + "tp_gather_partition_size": 8, + }, + } + + engine, *_ = deepspeed.initialize(model=model, config=ds_config) + + rollout = HybridEngineRollout( + engine=engine, + tokenizer=tokenizer, + continuous_batching_size=2, + use_graph_capture=True, + ) + + # Prepare prompt + prompt = "def fibonacci(n):" + ids = tokenizer(prompt, 
return_tensors="pt").input_ids.to(device) + req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids)) + sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4) + + # Warmup + torch.manual_seed(42) + engine.eval() + rollout.generate(req, sampling) + engine.train() + + # Benchmark + times = [] + for i in range(5): + torch.manual_seed(42) + engine.eval() + torch.cuda.synchronize() + t0 = time.time() + batch = rollout.generate(req, sampling) + torch.cuda.synchronize() + times.append(time.time() - t0) + engine.train() + + t_avg = sum(times[1:]) / len(times[1:]) + # Count tokens + pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id + total_toks = 0 + for i in range(batch.input_ids.shape[0]): + resp = batch.input_ids[i, batch.response_start_idx[i]:] + total_toks += (resp != pad_id).sum().item() + + if rank == 0: + print(f"\n{'='*60}") + print(f"Model: {model_name}") + print(f"TP=2, n=8, cb=4, graph_capture=True, max_new_tokens=256") + print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms") + print(f"Total response tokens: {total_toks}") + print(f"Throughput: {total_toks/t_avg:.1f} tok/s") + print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}") + print(f"{'='*60}\n") + + +if __name__ == "__main__": + main() diff --git a/training/opsd/benchmarks/bench_decode_1p1r.py b/training/opsd/benchmarks/bench_decode_1p1r.py new file mode 100644 index 000000000..58fb667d4 --- /dev/null +++ b/training/opsd/benchmarks/bench_decode_1p1r.py @@ -0,0 +1,180 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +"""Micro-benchmark for 1p1r HybridEngineRollout decode. + +Measures time breakdown of each decode step: + - model forward (attention + FFN) + - sampling (softmax + multinomial) + - Python overhead (mask concat, state update, etc.) 
+ +Usage: + python examples/opsd/bench_decode_1p1r.py --model Qwen/Qwen2.5-0.5B-Instruct +""" + +import argparse +import time + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from deepspeed.accelerator import get_accelerator + +from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout +from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig + + +def bench_decode_raw(model, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10): + """Raw decode loop benchmark — measures each component separately.""" + model.eval() + model_dtype = next(model.parameters()).dtype + + input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) + attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) + + results = { + "prompt_len": prompt_len, + "max_new_tokens": max_new_tokens, + "model_dtype": str(model_dtype), + } + + timings = {"prefill": [], "decode_forward": [], "sampling": [], "overhead": [], "total": []} + + for _ in range(num_warmup + num_iters): + with torch.no_grad(): + t0 = time.perf_counter() + out = model(input_ids, attention_mask=attn_mask, use_cache=True) + past = out.past_key_values + logits = out.logits[:, -1:, :] + t_prefill = time.perf_counter() + + generated = [] + cur_token = logits.argmax(dim=-1) + generated.append(cur_token) + cur_mask = attn_mask + + decode_times = [] + sample_times = [] + overhead_times = [] + + for step in range(max_new_tokens): + t_step = time.perf_counter() + cur_mask = torch.cat([cur_mask, torch.ones(1, 1, dtype=torch.long, device=device)], dim=1) + pos_ids = torch.tensor([[prompt_len + step]], device=device) + + t_fwd = time.perf_counter() + out = model(cur_token, + attention_mask=cur_mask, + position_ids=pos_ids, + past_key_values=past, + use_cache=True) + past = out.past_key_values + t_fwd_end = time.perf_counter() + + next_logits = out.logits[:, -1, :] + probs = torch.softmax(next_logits / 1.0, dim=-1) + cur_token = 
torch.multinomial(probs, 1) + t_sample = time.perf_counter() + + generated.append(cur_token) + t_overhead = time.perf_counter() + + decode_times.append(t_fwd_end - t_fwd) + sample_times.append(t_sample - t_fwd_end) + overhead_times.append(t_overhead - t_sample) + + t_total = time.perf_counter() + + timings["prefill"].append(t_prefill - t0) + timings["decode_forward"].append(decode_times) + timings["sampling"].append(sample_times) + timings["overhead"].append(overhead_times) + timings["total"].append(t_total - t0) + + import numpy as np + + def avg_last_n(lst, n): + return np.mean(lst[-n:]) + + def avg_of_avg(list_of_lists, n): + arrs = [np.array(ls[-n:]) for ls in list_of_lists] + return np.mean([a.mean() for a in arrs]) + + results["prefill_ms"] = avg_last_n(timings["prefill"], num_iters) * 1000 + results["decode_forward_ms_per_step"] = avg_of_avg(timings["decode_forward"], num_iters) * 1000 + results["sampling_ms_per_step"] = avg_of_avg(timings["sampling"], num_iters) * 1000 + results["overhead_ms_per_step"] = avg_of_avg(timings["overhead"], num_iters) * 1000 + results["total_ms"] = avg_last_n(timings["total"], num_iters) * 1000 + results["decode_steps_total_ms"] = results["decode_forward_ms_per_step"] * max_new_tokens + results["sampling_total_ms"] = results["sampling_ms_per_step"] * max_new_tokens + results["overhead_total_ms"] = results["overhead_ms_per_step"] * max_new_tokens + + return results + + +def bench_hybrid_rollout(rollout, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10): + """Benchmark the full HybridEngineRollout.generate() path.""" + input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) + attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) + sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0) + request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask) + + times = [] + for _ in range(num_warmup + num_iters): + 
get_accelerator().synchronize() #ignore-cuda + t0 = time.perf_counter() + with torch.no_grad(): + rollout.generate(request, sampling) + get_accelerator().synchronize() #ignore-cuda + times.append(time.perf_counter() - t0) + + import numpy as np + avg = np.mean(times[-num_iters:]) * 1000 + return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct") + parser.add_argument("--prompt-len", type=int, default=64) + parser.add_argument("--max-new-tokens", type=int, default=64) + parser.add_argument("--num-warmup", type=int, default=3) + parser.add_argument("--num-iters", type=int, default=10) + args = parser.parse_args() + + device = get_accelerator().current_device() #ignore-cuda + + tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left") + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16).to(device) + + print(f"=== Raw decode loop benchmark (model={args.model}) ===") + raw = bench_decode_raw(model, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup, + args.num_iters) + print(f" Prefill: {raw['prefill_ms']:.2f} ms") + print( + f" Decode forward/step: {raw['decode_forward_ms_per_step']:.3f} ms (total: {raw['decode_steps_total_ms']:.1f} ms)" + ) + print(f" Sampling/step: {raw['sampling_ms_per_step']:.3f} ms (total: {raw['sampling_total_ms']:.1f} ms)") + print(f" Overhead/step: {raw['overhead_ms_per_step']:.3f} ms (total: {raw['overhead_total_ms']:.1f} ms)") + print(f" Total: {raw['total_ms']:.1f} ms") + + print(f"\n=== HybridEngineRollout benchmark ===") + rollout = HybridEngineRollout(model, tokenizer) + rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup, + args.num_iters) + print(f" Rollout generate: 
{rr['rollout_total_ms']:.1f} ms") + + print(f"\n=== Summary ===") + print(f" Raw decode loop: {raw['total_ms']:.1f} ms") + print(f" HybridEngine rollout: {rr['rollout_total_ms']:.1f} ms") + print(f" Overhead (rollout - raw): {rr['rollout_total_ms'] - raw['total_ms']:.1f} ms") + print( + f" Bottleneck: decode forward = {raw['decode_forward_ms_per_step']:.3f} ms/step x {args.max_new_tokens} steps = {raw['decode_steps_total_ms']:.1f} ms" + ) + + +if __name__ == "__main__": + main() diff --git a/training/opsd/benchmarks/bench_hybrid_tp.py b/training/opsd/benchmarks/bench_hybrid_tp.py new file mode 100644 index 000000000..3f41150c7 --- /dev/null +++ b/training/opsd/benchmarks/bench_hybrid_tp.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2). + +Usage: + deepspeed --num_gpus 2 bench_hybrid_tp.py \ + --model Qwen/Qwen2.5-14B-Instruct \ + --max-new-tokens 64 +""" + +import argparse +import os +import time + +import deepspeed +import numpy as np +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout +from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig + + +def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters): + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + device = torch.device(f"cuda:{local_rank}") + + torch.manual_seed(42) + input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) + attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) + sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0) + request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask) + + times = [] + for i in range(num_warmup + num_iters): + 
get_accelerator().synchronize(device=device) #ignore-cuda + t0 = time.perf_counter() + with torch.no_grad(): + result = rollout.generate(request, sampling) + get_accelerator().synchronize(device=device) #ignore-cuda + elapsed = time.perf_counter() - t0 + times.append(elapsed) + if local_rank == 0: + label = "warmup" if i < num_warmup else "iter" + n_tokens = result.input_ids.shape[-1] - prompt_len + print(f" [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}") + + avg = np.mean(times[-num_iters:]) * 1000 + return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct") + parser.add_argument("--prompt-len", type=int, default=64) + parser.add_argument("--max-new-tokens", type=int, default=64) + parser.add_argument("--num-warmup", type=int, default=3) + parser.add_argument("--num-iters", type=int, default=10) + parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0))) + args = parser.parse_args() + + local_rank = args.local_rank + world_size = int(os.environ.get("WORLD_SIZE", "1")) + + deepspeed.init_distributed() + + if local_rank == 0: + print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===") + print(f" Model: {args.model}") + print(f" TP size: {world_size}") + print(f" Prompt len: {args.prompt_len}") + print(f" Decode len: {args.max_new_tokens}") + print(f" Warmup: {args.num_warmup}") + print(f" Iters: {args.num_iters}") + print() + + tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left") + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + args.model, + torch_dtype=torch.bfloat16, + ) + + ds_config = { + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": 0 + }, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": world_size, + 
"gradient_accumulation_steps": 1, + "tensor_parallel": { + "autotp_size": world_size, + "preset_model": "qwen2", + }, + } + + engine, *_ = deepspeed.initialize( + model=model, + optimizer=None, + model_parameters=model.parameters(), + config=ds_config, + ) + + if local_rank == 0: + print(" DeepSpeed engine initialized.") + param_count = sum(p.numel() for p in engine.parameters()) / 1e9 + alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda + print(f" Parameters (local): {param_count:.2f}B") + print(f" GPU mem allocated: {alloc:.1f} GB") + print() + + rollout = HybridEngineRollout(engine, tokenizer) + + if local_rank == 0: + print(" Running benchmark...") + + result = bench_hybrid_rollout( + rollout, + tokenizer, + args.prompt_len, + args.max_new_tokens, + args.num_warmup, + args.num_iters, + ) + + if local_rank == 0: + total = result["rollout_total_ms"] + per_step = total / args.max_new_tokens + throughput = 1000.0 / per_step + print() + print(f"=== Results ===") + print(f" Total generate: {total:.1f} ms") + print(f" Per decode step: {per_step:.2f} ms") + print(f" Throughput: {throughput:.1f} tokens/s") + + +if __name__ == "__main__": + main() diff --git a/training/opsd/benchmarks/bench_hybrid_tp_opt.py b/training/opsd/benchmarks/bench_hybrid_tp_opt.py new file mode 100644 index 000000000..d7fae2dde --- /dev/null +++ b/training/opsd/benchmarks/bench_hybrid_tp_opt.py @@ -0,0 +1,149 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer. 
+ +Usage: + deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \ + --model Qwen/Qwen2.5-14B-Instruct \ + --max-new-tokens 64 +""" + +import argparse +import os +import time + +import deepspeed +import numpy as np +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout +from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig + + +def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters): + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + device = torch.device(f"cuda:{local_rank}") + + torch.manual_seed(42) + input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) + attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) + sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0) + request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask) + + times = [] + for i in range(num_warmup + num_iters): + get_accelerator().synchronize(device=device) #ignore-cuda + t0 = time.perf_counter() + with torch.no_grad(): + result = rollout.generate(request, sampling) + get_accelerator().synchronize(device=device) #ignore-cuda + elapsed = time.perf_counter() - t0 + times.append(elapsed) + if local_rank == 0: + label = "warmup" if i < num_warmup else "iter" + n_tokens = result.input_ids.shape[-1] - prompt_len + print(f" [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}") + + avg = np.mean(times[-num_iters:]) * 1000 + return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct") + parser.add_argument("--prompt-len", type=int, default=64) + parser.add_argument("--max-new-tokens", type=int, default=64) + parser.add_argument("--num-warmup", type=int, 
default=3) + parser.add_argument("--num-iters", type=int, default=10) + parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0))) + args = parser.parse_args() + + local_rank = args.local_rank + world_size = int(os.environ.get("WORLD_SIZE", "1")) + + deepspeed.init_distributed() + + if local_rank == 0: + print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===") + print(f" Model: {args.model}") + print(f" TP size: {world_size}") + print(f" Prompt len: {args.prompt_len}") + print(f" Decode len: {args.max_new_tokens}") + print() + + tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left") + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + args.model, + torch_dtype=torch.bfloat16, + ) + + ds_config = { + "bf16": { + "enabled": True + }, + "zero_optimization": { + "stage": 0 + }, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": world_size, + "gradient_accumulation_steps": 1, + "tensor_parallel": { + "autotp_size": world_size, + "preset_model": "qwen2", + }, + } + + engine, _, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config, + ) + + if local_rank == 0: + print(" DeepSpeed engine initialized (with optimizer).") + param_count = sum(p.numel() for p in engine.parameters()) / 1e9 + alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda + reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda + print(f" Parameters (local): {param_count:.2f}B") + alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda + reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda + print(f" GPU mem allocated: {alloc:.1f} GB") + print(f" GPU mem reserved: {reserv:.1f} GB") + print() + + rollout = HybridEngineRollout(engine, tokenizer) + + if local_rank == 0: + print(" Running benchmark...") + + result = 
bench_hybrid_rollout( + rollout, + tokenizer, + args.prompt_len, + args.max_new_tokens, + args.num_warmup, + args.num_iters, + ) + + if local_rank == 0: + total = result["rollout_total_ms"] + per_step = total / args.max_new_tokens + throughput = 1000.0 / per_step + print() + print(f"=== Results ===") + print(f" Total generate: {total:.1f} ms") + print(f" Per decode step: {per_step:.2f} ms") + print(f" Throughput: {throughput:.1f} tokens/s") + alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda + reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda + print(f" GPU mem (final): alloc={alloc:.1f} GB, reserved={reserv:.1f} GB") + + +if __name__ == "__main__": + main() diff --git a/training/opsd/config.py b/training/opsd/config.py new file mode 100644 index 000000000..66ff7e21e --- /dev/null +++ b/training/opsd/config.py @@ -0,0 +1,104 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""OPSD application configuration. + +``OPSDConfig`` is loaded from JSON and threads through the entire pipeline. +The ``rollout`` sub-config is consumed by DeepSpeed's rollout engine; the +rest is application-level (trainer, data, distillation). 
+""" + +import json +from dataclasses import dataclass, field +from typing import Optional + +from deepspeed.runtime.rollout import RolloutConfig + + +@dataclass +class StudentConfig: + model_name_or_path: str + dtype: str = "bfloat16" + trust_remote_code: bool = False + + +@dataclass +class TeacherConfig: + model_name_or_path: str + dtype: str = "bfloat16" + trust_remote_code: bool = False + offload_to_cpu: bool = True + + +@dataclass +class DistillationConfig: + # "forward_kl" | "reverse_kl" | "jsd" + loss_type: str = "reverse_kl" + temperature: float = 1.0 + chunk_size: int = 512 + + +@dataclass +class TrainingConfig: + train_batch_size: int = 8 + micro_batch_size_per_gpu: int = 1 + gradient_accumulation_steps: int = 1 + learning_rate: float = 1e-6 + weight_decay: float = 0.0 + num_train_epochs: int = 1 + max_steps: int = -1 + warmup_steps: int = 0 + save_steps: int = 500 + logging_steps: int = 10 + save_dir: str = "./opsd_ckpt" + seed: int = 42 + + +@dataclass +class DataConfig: + path: str = "" + prompt_field: str = "prompt" + chat_template: Optional[str] = None + shuffle: bool = True + + +@dataclass +class OPSDConfig: + student: StudentConfig + teacher: TeacherConfig + rollout: RolloutConfig = field(default_factory=RolloutConfig) + distillation: DistillationConfig = field(default_factory=DistillationConfig) + training: TrainingConfig = field(default_factory=TrainingConfig) + data: DataConfig = field(default_factory=DataConfig) + deepspeed_config: str = "" + + @classmethod + def from_json(cls, path: str) -> "OPSDConfig": + with open(path, "r") as f: + raw = json.load(f) + return cls.from_dict(raw) + + @classmethod + def from_dict(cls, raw: dict) -> "OPSDConfig": + return cls( + student=StudentConfig(**raw["student"]), + teacher=TeacherConfig(**raw["teacher"]), + rollout=RolloutConfig(**raw.get("rollout", {})), + distillation=DistillationConfig(**raw.get("distillation", {})), + training=TrainingConfig(**raw.get("training", {})), + 
data=DataConfig(**raw.get("data", {})), + deepspeed_config=raw.get("deepspeed_config", ""), + ) + + def to_dict(self) -> dict: + from dataclasses import asdict + return asdict(self) + + def validate(self) -> None: + if self.distillation.loss_type not in ("forward_kl", "reverse_kl", "jsd"): + raise ValueError(f"Unknown loss_type {self.distillation.loss_type!r}") + if self.rollout.engine != "hybrid_engine": + raise ValueError(f"Unknown rollout engine {self.rollout.engine!r}; expected 'hybrid_engine'") + if self.distillation.chunk_size <= 0: + raise ValueError("distillation.chunk_size must be positive") diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json deleted file mode 100644 index 63c1ecc03..000000000 --- a/training/opsd/configs/opsd_vllm_disjoint.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "student": { - "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", - "dtype": "bfloat16", - "trust_remote_code": false, - }, - "teacher": { - "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct", - "dtype": "bfloat16", - "trust_remote_code": false, - "offload_to_cpu": true - }, - "rollout": { - "engine": "vllm", - "max_prompt_length": 1024, - "max_response_length": 1024, - "temperature": 0, - "top_p": 1.0, - "top_k": -1, - "n_samples_per_prompt": 1, - "tensor_parallel_size": 2, - "gpu_memory_utilization": 0.85, - "engine_dtype": "bfloat16", - "weight_sync_interval": 4, - "vllm_min_version": "0.6.4", - "vllm_port": 8000 - }, - "distillation": { - "loss_type": "reverse_kl", - "temperature": 0, - "chunk_size": 512 - }, - "training": { - "train_batch_size": 1, - "micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 1e-6, - "weight_decay": 0.0, - "num_train_epochs": 1, - "max_steps": -1, - "warmup_steps": 0, - "save_steps": 500, - "logging_steps": 10, - "save_dir": "./opsd_ckpt_vllm", - "seed": 42 - }, - "data": { - "path": "data/prompts.jsonl", - "prompt_field": "prompt", - "shuffle": true - }, - 
"deepspeed_config": "configs/ds_zero3.json" -} diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json deleted file mode 100644 index bda234ca0..000000000 --- a/training/opsd/configs/smoke_vllm.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "student": { - "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", - "dtype": "bfloat16", - "trust_remote_code": false, - }, - "teacher": { - "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", - "dtype": "bfloat16", - "trust_remote_code": false, - "offload_to_cpu": false - }, - "rollout": { - "engine": "vllm", - "max_prompt_length": 128, - "max_response_length": 64, - "temperature": 0, - "top_p": 1.0, - "top_k": -1, - "n_samples_per_prompt": 1, - "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, - "engine_dtype": "bfloat16", - "weight_sync_interval": 2, - "vllm_min_version": "0.6.4", - "vllm_enforce_eager": true, - "vllm_port": 8000, - "vllm_python": "/root/miniconda3/envs/vllm/bin/python", - "weight_transfer_backend": "gdr" - }, - "distillation": { - "loss_type": "reverse_kl", - "temperature": 0, - "chunk_size": 128 - }, - "training": { - "train_batch_size": 1, - "micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 1e-6, - "weight_decay": 0.0, - "num_train_epochs": 1, - "max_steps": 5, - "warmup_steps": 0, - "save_steps": 10000, - "logging_steps": 1, - "save_dir": "./opsd_smoke_vllm_ckpt", - "seed": 42 - }, - "data": { - "path": "data/prompts.jsonl", - "prompt_field": "prompt", - "shuffle": true - }, - "deepspeed_config": "configs/smoke_ds_zero0.json" -} diff --git a/training/opsd/losses.py b/training/opsd/losses.py new file mode 100644 index 000000000..d9f4b9266 --- /dev/null +++ b/training/opsd/losses.py @@ -0,0 +1,192 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Per-token distillation divergences with sequence-axis chunking. 
+ +The full ``[B, T, V]`` tensor produced by a forward pass on a modern LLM can +easily exceed several GB in fp32 (e.g. 8 * 1024 * 150k * 4 B ~ 4.9 GB). Holding +both student *and* teacher logits at once would double that. We chunk along the +sequence axis so the per-chunk softmax + difference only ever needs +``[B, chunk, V]`` of working memory, regardless of T. + +Math conventions: + * ``forward_kl`` = D_KL(teacher || student) — mode-covering for student + * ``reverse_kl`` = D_KL(student || teacher) — mode-seeking for student + * ``jsd`` = 0.5 * D_KL(P || M) + 0.5 * D_KL(Q || M), M = (P+Q)/2 + +All three follow the standard knowledge-distillation temperature convention: +divide logits by T before softmax, then multiply the result by T**2 so that +gradient magnitudes are comparable across temperatures. +""" + +from typing import Callable + +import torch +import torch.nn.functional as F + + +def _forward_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor: + s_log_probs = F.log_softmax(student_logits / temperature, dim=-1) + t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1) + t_probs = t_log_probs.exp() + kl = (t_probs * (t_log_probs - s_log_probs)).sum(dim=-1) + return kl * (temperature**2) + + +def _reverse_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor: + s_log_probs = F.log_softmax(student_logits / temperature, dim=-1) + t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1) + s_probs = s_log_probs.exp() + kl = (s_probs * (s_log_probs - t_log_probs)).sum(dim=-1) + return kl * (temperature**2) + + +def _jsd(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor: + s_log_probs = F.log_softmax(student_logits / temperature, dim=-1) + t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1) + s_probs = s_log_probs.exp() + t_probs = t_log_probs.exp() + m_probs = 0.5 * (s_probs + t_probs) + # 
Clamp guards against log(0) when both distributions have ~0 mass on the + # same vocab id (rare in practice but possible after temperature scaling). + m_log_probs = m_probs.clamp_min(1e-12).log() + kl_s = (s_probs * (s_log_probs - m_log_probs)).sum(dim=-1) + kl_t = (t_probs * (t_log_probs - m_log_probs)).sum(dim=-1) + return 0.5 * (kl_s + kl_t) * (temperature**2) + + +_LOSS_FNS: "dict[str, Callable[..., torch.Tensor]]" = { + "forward_kl": _forward_kl, + "reverse_kl": _reverse_kl, + "jsd": _jsd, +} + + +def chunked_distillation_loss( + student_logits: torch.Tensor, + teacher_logits: torch.Tensor, + response_mask: torch.Tensor, + loss_type: str = "reverse_kl", + temperature: float = 1.0, + chunk_size: int = 512, +) -> torch.Tensor: + """Mean per-token divergence over response positions, chunked over the + sequence axis to bound peak memory. + + Args: + student_logits: ``[B, T, V]`` — gradient flows here. + teacher_logits: ``[B, T, V]`` — caller is responsible for ``detach()`` + (we do not detach here so the function stays cheap). + response_mask: ``[B, T]`` — 1 where the position should contribute to + the loss (i.e. response tokens, not prompt or padding), 0 elsewhere. + loss_type: ``"forward_kl"`` | ``"reverse_kl"`` | ``"jsd"``. + temperature: KD temperature; >1 softens both distributions. + chunk_size: Sequence-axis chunk size. + + Returns: + Scalar loss = sum-over-positions(per_tok * mask) / sum(mask), promoted + to fp32 internally for numerical stability. 
+ """ + if loss_type not in _LOSS_FNS: + raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}") + fn = _LOSS_FNS[loss_type] + + if student_logits.shape != teacher_logits.shape: + raise ValueError(f"shape mismatch: student {tuple(student_logits.shape)} vs teacher " + f"{tuple(teacher_logits.shape)}") + B, T, _ = student_logits.shape + if response_mask.shape != (B, T): + raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits " + f"prefix ({B}, {T})") + + mask_f = response_mask.to(torch.float32) + total_tokens = mask_f.sum().clamp_min(1.0) + total_loss = student_logits.new_zeros((), dtype=torch.float32) + + for start in range(0, T, chunk_size): + end = min(start + chunk_size, T) + chunk_mask = mask_f[:, start:end] + # Skipping empty chunks avoids a redundant forward through the softmax + # path on chunks that wouldn't contribute anything to the sum. + if chunk_mask.sum().item() == 0: + continue + per_tok = fn( + student_logits[:, start:end].float(), + teacher_logits[:, start:end].float(), + temperature, + ) + total_loss = total_loss + (per_tok * chunk_mask).sum() + + return total_loss / total_tokens + + +def streamed_distillation_loss( + student_logits: torch.Tensor, + teacher_chunk_fetcher: Callable[[int, int], torch.Tensor], + response_mask: torch.Tensor, + loss_type: str = "reverse_kl", + temperature: float = 1.0, + chunk_size: int = 512, +) -> torch.Tensor: + """Same math as :func:`chunked_distillation_loss`, but teacher logits are + pulled chunk-by-chunk via a fetcher so the full ``[B, T, V]`` teacher + tensor never needs to live on the same device as the student. + + Args: + student_logits: ``[B, T, V]`` on the training device. + teacher_chunk_fetcher: ``fn(start, end) -> [B, end - start, V]``, already + on the same device and broadcastable dtype as ``student_logits``. + Typically wraps ``TeacherLogitCache.chunk_to_device``. + response_mask: ``[B, T]`` — 1 where the position should contribute. 
+ loss_type: one of ``"forward_kl" | "reverse_kl" | "jsd"``. + temperature: KD temperature. + chunk_size: Sequence-axis chunk size. + """ + if loss_type not in _LOSS_FNS: + raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}") + fn = _LOSS_FNS[loss_type] + + B, T, _ = student_logits.shape + if response_mask.shape != (B, T): + raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits " + f"prefix ({B}, {T})") + + mask_f = response_mask.to(torch.float32) + total_tokens = mask_f.sum().clamp_min(1.0) + total_loss = student_logits.new_zeros((), dtype=torch.float32) + + for start in range(0, T, chunk_size): + end = min(start + chunk_size, T) + chunk_mask = mask_f[:, start:end] + if chunk_mask.sum().item() == 0: + continue + teacher_chunk = teacher_chunk_fetcher(start, end) + if teacher_chunk.shape[1] != (end - start): + raise RuntimeError(f"fetcher returned chunk of length {teacher_chunk.shape[1]}, " + f"expected {end - start}") + per_tok = fn( + student_logits[:, start:end].float(), + teacher_chunk.float(), + temperature, + ) + total_loss = total_loss + (per_tok * chunk_mask).sum() + + return total_loss / total_tokens + + +def per_token_logprobs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + """Gather log p(label_t | context_ None: diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh deleted file mode 100644 index f39d659ec..000000000 --- a/training/opsd/scripts/train_opsd_vllm.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -# -# Launch OPSD training with vLLM rollout. -# -# The vLLM server is started **lazily** as a subprocess by training rank 0 -# on first use, so no separate vLLM launch step is required. 
The GPUs -# assigned to the vLLM server are controlled by the ROLLOUT_VISIBLE_DEVICE -# environment variable (comma-separated CUDA device indices). The training -# ranks must run on a *different* set of GPUs so the two don't contend for -# memory. -# -# Default topology: ranks 0..5 train on GPUs 0-5 (ZeRO-3), devices 6-7 -# run vLLM with TP=2. Override via: -# ROLLOUT_VISIBLE_DEVICE=... NUM_TRAIN_GPUS=.. INCLUDE_GPUS=.. bash ... -set -euo pipefail - -CONFIG="${1:-configs/opsd_vllm_disjoint.json}" -NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}" -INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}" -export ROLLOUT_VISIBLE_DEVICE="${ROLLOUT_VISIBLE_DEVICE:-6,7}" - -deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \ - main.py --config "${CONFIG}" diff --git a/training/opsd/tests/test_losses.py b/training/opsd/tests/test_losses.py index 41ea92289..2e28874be 100644 --- a/training/opsd/tests/test_losses.py +++ b/training/opsd/tests/test_losses.py @@ -13,8 +13,8 @@ import pytest import torch -from deepspeed.runtime.rlhf.losses import chunked_distillation_loss, per_token_logprobs -from deepspeed.runtime.rlhf.utils import build_response_mask, shift_for_next_token_prediction +from losses import chunked_distillation_loss, per_token_logprobs +from utils import build_response_mask, shift_for_next_token_prediction @pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"]) diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py index 36d2fcea8..090aa3ed7 100644 --- a/training/opsd/tests/test_teacher_caching.py +++ b/training/opsd/tests/test_teacher_caching.py @@ -85,7 +85,7 @@ def test_streamed_chunked_loss_matches_full_loss(): """End-to-end check: pulling teacher logits chunk-by-chunk through the cache yields the same distillation loss as passing the full teacher tensor to ``chunked_distillation_loss`` directly.""" - from deepspeed.runtime.rlhf.losses import chunked_distillation_loss + from losses import 
chunked_distillation_loss torch.manual_seed(0) s = torch.randn(2, 64, 32) diff --git a/training/opsd/trainer.py b/training/opsd/trainer.py new file mode 100644 index 000000000..34e60807c --- /dev/null +++ b/training/opsd/trainer.py @@ -0,0 +1,210 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""On-policy distillation (OPSD) training loop. + +Each step is three phases: + + 0. **Rollout.** The student generates responses for the batch's prompts + (via the configured :class:`~deepspeed.runtime.rollout.RolloutEngine`). + 1. **Teacher.** The frozen teacher runs a forward over prompt+response. The + full logit tensor is parked on the host via + :class:`~opsd.teacher.TeacherLogitCache` so teacher GPU buffers can be + released before the student backward. + 2. **Student.** The student runs forward+backward on prompt+response. The + loss is the per-token divergence to the teacher, streamed from the + host-resident cache one sequence chunk at a time + (:func:`~opsd.losses.streamed_distillation_loss`), so + the full ``[B, T, V]`` teacher tensor never co-resides with the student + logits on the training device. + +The trainer itself contains no DeepSpeed-specific control flow beyond the +``backward`` / ``step`` calls on the student engine; backend choice (ZeRO +stage, offload, hybrid engine) is owned entirely by the DeepSpeed JSON config. 
+""" + +import os +import time +from abc import ABC, abstractmethod +from typing import Any + +import torch +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator + +from config import OPSDConfig +from losses import streamed_distillation_loss +from utils import build_response_mask +from deepspeed.runtime.rollout import RolloutEngine, RolloutRequest, SamplingConfig + + +def _is_rank_zero() -> bool: + return (not dist.is_initialized()) or dist.get_rank() == 0 + + +class RLHFTrainer(ABC): + """Base class for RLHF training loops.""" + + @abstractmethod + def train(self) -> None: + ... + + @abstractmethod + def _train_step(self, batch: Any) -> dict: + ... + + +class OPSDTrainer(RLHFTrainer): + + def __init__( + self, + cfg: OPSDConfig, + student_engine: Any, + teacher: Any, + tokenizer: Any, + rollout: RolloutEngine, + dataloader: Any, + ): + self.cfg = cfg + self.student_engine = student_engine + self.teacher = teacher + self.tokenizer = tokenizer + self.rollout = rollout + self.dataloader = dataloader + + self.device = get_accelerator().current_device_name() + self.step = 0 + + # ------------------------------------------------------------------ + # Driver + # ------------------------------------------------------------------ + + def train(self) -> None: + max_steps = self.cfg.training.max_steps + for epoch in range(self.cfg.training.num_train_epochs): + for batch in self.dataloader: + if max_steps > 0 and self.step >= max_steps: + return + metrics = self._train_step(batch) + self._maybe_log(metrics) + self._maybe_save() + self.step += 1 + if max_steps > 0 and self.step >= max_steps: + return + + # ------------------------------------------------------------------ + # One step + # ------------------------------------------------------------------ + + def _train_step(self, batch) -> dict: + t_start = time.time() + + prompt_ids = batch["prompt_ids"].to(self.device, non_blocking=True) + prompt_attn = 
batch["prompt_attention_mask"].to(self.device, non_blocking=True) + + # Sync student weights into the rollout backend. + # No-op for hybrid engine; meaningful for vLLM. + self.rollout.sync_weights(self.step) + + # --- Phase 0: rollout (student generates responses) --------------- + # Switch hybrid engine to inference mode (gathers ZeRO-3 params). + self.student_engine.eval() + sampling = SamplingConfig( + max_new_tokens=self.cfg.rollout.max_response_length, + temperature=self.cfg.rollout.temperature, + top_p=self.cfg.rollout.top_p, + top_k=self.cfg.rollout.top_k, + n_samples_per_prompt=self.cfg.rollout.n_samples_per_prompt, + ) + roll = self.rollout.generate( + RolloutRequest(prompt_ids=prompt_ids, prompt_attention_mask=prompt_attn), + sampling, + ) + input_ids = roll.input_ids.to(self.device, non_blocking=True) + attention_mask = roll.attention_mask.to(self.device, non_blocking=True) + response_start_idx = roll.response_start_idx.to(self.device, non_blocking=True) + response_mask = build_response_mask(response_start_idx, attention_mask) + t_rollout = time.time() - t_start + + # --- Phase 1: teacher forward → host-cached logits ---------------- + t1 = time.time() + teacher_cache = self.teacher.forward_to_cache(input_ids, attention_mask) + t_teacher = time.time() - t1 + + # --- Phase 2: student forward + streamed KL + backward ------------ + t2 = time.time() + self.student_engine.train() + outputs = self.student_engine(input_ids=input_ids, attention_mask=attention_mask) + student_logits = outputs.logits # [B, T, V] + + # Shift for next-token prediction: logits at position t predict token + # at t+1, so the loss aligns student_logits[:, :-1] with the position + # t+1 entries of the response mask. 
+ student_logits_shifted = student_logits[:, :-1, :] + mask_shifted = response_mask[:, 1:].contiguous() + + def _fetch(start: int, end: int) -> torch.Tensor: + # The cache holds *unshifted* teacher logits; for the next-token + # objective we ask the cache for positions [start, end) of the + # shifted teacher, which is positions [start, end) of the original + # since we already lopped off the final column in the student. + return teacher_cache.chunk_to_device(start, + end, + device=student_logits_shifted.device, + dtype=student_logits_shifted.dtype) + + loss = streamed_distillation_loss( + student_logits=student_logits_shifted, + teacher_chunk_fetcher=_fetch, + response_mask=mask_shifted, + loss_type=self.cfg.distillation.loss_type, + temperature=self.cfg.distillation.temperature, + chunk_size=self.cfg.distillation.chunk_size, + ) + + self.student_engine.backward(loss) + self.student_engine.step() + + teacher_cache.free() + t_student = time.time() - t2 + + # Reduce loss across ranks for a clean log line. 
+ loss_for_log = loss.detach().clone() + if dist.is_initialized(): + dist.all_reduce(loss_for_log) + loss_for_log /= dist.get_world_size() + + return { + "loss": float(loss_for_log.item()), + "rollout_s": t_rollout, + "teacher_s": t_teacher, + "student_s": t_student, + "step_s": time.time() - t_start, + "response_tokens": int(mask_shifted.sum().item()), + } + + # ------------------------------------------------------------------ + # Logging / checkpointing + # ------------------------------------------------------------------ + + def _maybe_log(self, metrics: dict) -> None: + if self.step % self.cfg.training.logging_steps != 0: + return + if not _is_rank_zero(): + return + print(f"[opsd][step {self.step}] loss={metrics['loss']:.4f} " + f"rollout={metrics['rollout_s']:.2f}s teacher={metrics['teacher_s']:.2f}s " + f"student={metrics['student_s']:.2f}s step={metrics['step_s']:.2f}s " + f"resp_tok={metrics['response_tokens']}") + + def _maybe_save(self) -> None: + if self.step == 0: + return + if self.step % self.cfg.training.save_steps != 0: + return + tag = f"step_{self.step}" + os.makedirs(self.cfg.training.save_dir, exist_ok=True) + self.student_engine.save_checkpoint(self.cfg.training.save_dir, tag=tag) + if _is_rank_zero(): + print(f"[opsd] saved checkpoint to {self.cfg.training.save_dir}/{tag}") diff --git a/training/opsd/utils.py b/training/opsd/utils.py new file mode 100644 index 000000000..b2954407b --- /dev/null +++ b/training/opsd/utils.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Small tensor/masking helpers shared by trainer, losses, and tests. + +These intentionally stay free of DeepSpeed / distributed imports so the +non-distributed unit tests can exercise them on CPU without a torchrun +launcher. 
+""" + +import torch + + +def build_response_mask(response_start_idx: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Mark positions belonging to the response (not prompt, not padding). + + Args: + response_start_idx: ``[B]`` int tensor — the first column index that is + part of the response, per sample. For *right-padded* prompts this + equals the prompt's token count; for the more common *left-padded* + convention used by causal generation it equals the prompt section + length (i.e. the column where prompt ends and response begins). + attention_mask: ``[B, T]`` — 1 on real tokens (prompt + response), 0 on + padding. + + Returns: + ``[B, T]`` 0/1 mask with the same dtype as ``attention_mask``. 1 only + at positions ``t >= response_start_idx[b]`` that are also attended. + """ + if response_start_idx.dim() != 1: + raise ValueError(f"response_start_idx must be 1-D, got shape {tuple(response_start_idx.shape)}") + if attention_mask.dim() != 2: + raise ValueError(f"attention_mask must be 2-D, got shape {tuple(attention_mask.shape)}") + B, T = attention_mask.shape + if response_start_idx.shape[0] != B: + raise ValueError(f"response_start_idx batch ({response_start_idx.shape[0]}) != " + f"attention_mask batch ({B})") + + pos = torch.arange(T, device=attention_mask.device).unsqueeze(0).expand(B, T) + is_response = pos >= response_start_idx.to(pos.dtype).unsqueeze(1) + return is_response.to(attention_mask.dtype) * attention_mask + + +def shift_for_next_token_prediction(logits: torch.Tensor, labels: torch.Tensor): + """Align logits at position t with the label at position t+1. + + Returns: + Tuple ``(logits[:, :-1, :], labels[:, 1:])`` — both made + contiguous, so they can be safely indexed for the divergence loss. 
+ """ + return logits[:, :-1, :].contiguous(), labels[:, 1:].contiguous() From 0e1c004c27c54b83de33ef713a59a3d18606c8ee Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 18:19:14 +0800 Subject: [PATCH 4/8] Move teacher.py and data.py from DeepSpeed to DeepSpeedExamples Signed-off-by: Guokai Ma --- training/opsd/data.py | 108 +++++++++++ training/opsd/main.py | 4 +- training/opsd/teacher.py | 191 ++++++++++++++++++++ training/opsd/tests/test_teacher_caching.py | 2 +- 4 files changed, 302 insertions(+), 3 deletions(-) create mode 100644 training/opsd/data.py create mode 100644 training/opsd/teacher.py diff --git a/training/opsd/data.py b/training/opsd/data.py new file mode 100644 index 000000000..8ce86b56c --- /dev/null +++ b/training/opsd/data.py @@ -0,0 +1,108 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Prompt dataset and left-padding collator for OPSD rollouts. + +The dataset reads a JSONL file with one record per line; each record must +contain a string under :attr:`DataConfig.prompt_field` (default ``"prompt"``). +If the tokenizer exposes ``apply_chat_template``, single-turn prompts are +wrapped in a user-role message with ``add_generation_prompt=True`` so the +student generates the assistant turn. + +Batches are **left-padded** because causal generation requires real tokens at + the right edge — :class:`deepspeed.runtime.rollout.RolloutRequest` and the hybrid-engine +backend both assume this layout. 
+""" + +import json +from typing import Any, Dict, List, Optional + +import torch +from torch.utils.data import Dataset + + +class PromptDataset(Dataset): + """Reads ``{prompt_field: str}`` records from a JSONL file.""" + + def __init__( + self, + path: str, + tokenizer: Any, + max_prompt_length: int, + prompt_field: str = "prompt", + chat_template: Optional[str] = None, + ): + self.records = self._load_jsonl(path) + self.tokenizer = tokenizer + self.max_prompt_length = max_prompt_length + self.prompt_field = prompt_field + self.chat_template = chat_template + + @staticmethod + def _load_jsonl(path: str) -> List[Dict[str, Any]]: + records: List[Dict[str, Any]] = [] + with open(path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + + def __len__(self) -> int: + return len(self.records) + + def __getitem__(self, idx: int) -> str: + rec = self.records[idx] + if self.prompt_field not in rec: + raise KeyError(f"record {idx} missing field {self.prompt_field!r}") + text = rec[self.prompt_field] + + # If the tokenizer knows a chat template, render the prompt as a single + # user-role turn and request the generation prompt. This matches how + # instruction-tuned student/teacher checkpoints expect inputs. 
+ if hasattr(self.tokenizer, "apply_chat_template"): + messages = [{"role": "user", "content": text}] if isinstance(text, str) else text + text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=self.chat_template, + ) + return text + + +class LeftPaddedPromptCollator: + """Tokenizes a batch of prompt strings into a left-padded tensor batch.""" + + def __init__(self, tokenizer: Any, max_prompt_length: int): + self.tokenizer = tokenizer + self.max_prompt_length = max_prompt_length + self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id + if self.pad_id is None: + raise ValueError("tokenizer has neither pad_token_id nor eos_token_id; " + "cannot construct a padding collator") + + def __call__(self, batch_texts: List[str]) -> Dict[str, torch.Tensor]: + per_sample = [ + self.tokenizer( + t, + add_special_tokens=False, + truncation=True, + max_length=self.max_prompt_length, + return_tensors="pt", + )["input_ids"].squeeze(0) for t in batch_texts + ] + max_len = max(int(x.shape[0]) for x in per_sample) + B = len(per_sample) + + prompt_ids = torch.full((B, max_len), self.pad_id, dtype=torch.long) + attention_mask = torch.zeros((B, max_len), dtype=torch.long) + for i, ids in enumerate(per_sample): + n = int(ids.shape[0]) + # left-pad: real tokens at the right edge + prompt_ids[i, max_len - n:] = ids + attention_mask[i, max_len - n:] = 1 + + return {"prompt_ids": prompt_ids, "prompt_attention_mask": attention_mask} diff --git a/training/opsd/main.py b/training/opsd/main.py index 62298829a..91bf4a4d6 100644 --- a/training/opsd/main.py +++ b/training/opsd/main.py @@ -25,9 +25,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from config import OPSDConfig -from deepspeed.runtime.rlhf.data import LeftPaddedPromptCollator, PromptDataset +from data import LeftPaddedPromptCollator, PromptDataset from deepspeed.runtime.rollout import build_rollout -from 
deepspeed.runtime.rlhf.teacher import TeacherWrapper +from teacher import TeacherWrapper from trainer import OPSDTrainer diff --git a/training/opsd/teacher.py b/training/opsd/teacher.py new file mode 100644 index 000000000..1afaddd68 --- /dev/null +++ b/training/opsd/teacher.py @@ -0,0 +1,191 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Frozen teacher: two-phase forward with CPU-cached logits. + +The trainer runs each step in two phases: + + 1. **Teacher phase.** Forward over the prompt+response. The full ``[B, T, V]`` + logit tensor is moved off the GPU into a :class:`TeacherLogitCache` so that + teacher weight buffers can be released before the student backward pass. + 2. **Student phase.** Forward + backward on the student. The distillation + loss pulls teacher logits back to GPU **one sequence chunk at a time** via + :meth:`TeacherLogitCache.chunk_to_device`, so peak GPU memory for teacher + data is only ``[B, chunk, V]``. + +This module deliberately lazy-imports ``deepspeed`` and ``transformers`` so +that the pure data-handling pieces (``TeacherLogitCache`` and the streamed +loss in :mod:`opsd.losses`) remain importable in CPU-only unit tests that do +not have a working DeepSpeed launcher. +""" + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch + +# ``opsd.config`` is pure-Python (no distributed imports), so we can import it +# at module load time without pulling in DeepSpeed. +from config import TeacherConfig + + +@dataclass +class TeacherLogitCache: + """CPU-resident teacher logits with on-demand chunk fetch. + + Stored in low precision (default ``bfloat16``) to halve host memory; the + consumer in :mod:`opsd.losses` promotes back to fp32 inside the divergence + so the KD math stays well-conditioned. 
+ """ + + cpu_logits: torch.Tensor # [B, T, V] + + def __post_init__(self) -> None: + if self.cpu_logits.dim() != 3: + raise ValueError(f"cpu_logits must be 3-D [B, T, V]; got shape " + f"{tuple(self.cpu_logits.shape)}") + if self.cpu_logits.device.type != "cpu": + raise ValueError(f"cpu_logits must live on CPU; got device " + f"{self.cpu_logits.device}") + + @classmethod + def from_gpu_logits(cls, logits: torch.Tensor, store_dtype: torch.dtype = torch.bfloat16) -> "TeacherLogitCache": + """Detach + downcast + move to (pinned) host memory. + + ``non_blocking=True`` lets the copy overlap with the next CUDA op when + the destination is pinned; we try to pin and fall back silently if the + host doesn't support it (e.g. CPU-only test environments). + """ + downcast = logits.detach().to(dtype=store_dtype) + try: + host = torch.empty(downcast.shape, dtype=store_dtype, pin_memory=True) + host.copy_(downcast, non_blocking=True) + except RuntimeError: + host = downcast.cpu() + return cls(cpu_logits=host) + + @property + def shape(self) -> Tuple[int, int, int]: + s = self.cpu_logits.shape + return (int(s[0]), int(s[1]), int(s[2])) + + @property + def dtype(self) -> torch.dtype: + return self.cpu_logits.dtype + + def chunk_to_device(self, + start: int, + end: int, + device: torch.device, + dtype: Optional[torch.dtype] = None) -> torch.Tensor: + """Slice ``[:, start:end, :]`` and stage it on ``device``. + + ``dtype`` is the dtype on the destination; if ``None``, the stored + dtype is preserved. 
+ """ + _, T, _ = self.shape + if not (0 <= start < end <= T): + raise ValueError(f"chunk bounds [{start}, {end}) invalid for T={T}") + chunk = self.cpu_logits[:, start:end] + out = chunk.to(device=device, dtype=dtype if dtype is not None else chunk.dtype, non_blocking=True) + return out + + def free(self) -> None: + """Drop the underlying buffer so a step's teacher cache can be GC'd + before the next teacher forward.""" + self.cpu_logits = torch.empty(0) + + +_DTYPE_MAP = { + "float16": torch.float16, + "fp16": torch.float16, + "bfloat16": torch.bfloat16, + "bf16": torch.bfloat16, + "float32": torch.float32, + "fp32": torch.float32, +} + + +def _resolve_dtype(name: str) -> torch.dtype: + if name not in _DTYPE_MAP: + raise ValueError(f"Unknown dtype {name!r}; choose from {sorted(_DTYPE_MAP)}") + return _DTYPE_MAP[name] + + +class TeacherWrapper: + """Frozen teacher. + + Two modes depending on ``cfg.offload_to_cpu``: + + * ``offload_to_cpu=False`` — load the teacher with HF's standard + ``from_pretrained`` and pin it on the local accelerator device. The + whole teacher lives in GPU memory; simplest path and what to use when + the teacher fits. + + * ``offload_to_cpu=True`` — wrap the loaded model with + ``deepspeed.initialize`` using a ZeRO-3 config with + ``offload_param.device='cpu'``. The optimizer slot is unused (no + trainable params) but ZeRO-3 gives us per-forward parameter gather + / release and keeps weights on the host between forwards. This is the + path to use when the teacher would otherwise not fit alongside the + student. + + Both paths load the full checkpoint on each rank before DeepSpeed (if + used) partitions; we intentionally do **not** wrap ``from_pretrained`` + in ``deepspeed.zero.Init()`` because HF's loader partitions + ``low_cpu_mem_usage`` params to zero-width shards before the checkpoint + can fill them, which surfaces as a "size mismatch" load error. 
+ """ + + def __init__(self, cfg: TeacherConfig, world_size: int): + from deepspeed.accelerator import get_accelerator + from transformers import AutoModelForCausalLM + + self.cfg = cfg + dtype = _resolve_dtype(cfg.dtype) + device = get_accelerator().current_device_name() + + model = AutoModelForCausalLM.from_pretrained( + cfg.model_name_or_path, + torch_dtype=dtype, + trust_remote_code=cfg.trust_remote_code, + ) + model.eval() + for p in model.parameters(): + p.requires_grad_(False) + + if cfg.offload_to_cpu: + import deepspeed + + ds_config = { + "train_micro_batch_size_per_gpu": 1, + "bf16": { + "enabled": dtype is torch.bfloat16 + }, + "fp16": { + "enabled": dtype is torch.float16 + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu" + }, + }, + } + engine, *_ = deepspeed.initialize(model=model, config=ds_config) + self._callable = engine + self._uses_ds = True + else: + model.to(device) + self._callable = model + self._uses_ds = False + + @torch.no_grad() + def forward_to_cache(self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + store_dtype: torch.dtype = torch.bfloat16) -> TeacherLogitCache: + """Run teacher forward and stage logits onto the host.""" + outputs = self._callable(input_ids=input_ids, attention_mask=attention_mask) + return TeacherLogitCache.from_gpu_logits(outputs.logits, store_dtype=store_dtype) diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py index 090aa3ed7..f44bd6e7a 100644 --- a/training/opsd/tests/test_teacher_caching.py +++ b/training/opsd/tests/test_teacher_caching.py @@ -13,7 +13,7 @@ import pytest import torch -from deepspeed.runtime.rlhf.teacher import TeacherLogitCache +from teacher import TeacherLogitCache def test_round_trip_preserves_values_within_dtype(): From d0000be492af4e9529c79c106d79c10d18725f07 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 20:45:37 +0800 Subject: [PATCH 5/8] Extend RolloutConfig with app-level 
generation knobs; clean vLLM remnants from JSON - Subclass DeepSpeed's RolloutConfig to add temperature/top_p/etc - Remove weight_sync_interval from JSON configs (vLLM remnant) Signed-off-by: Guokai Ma --- training/opsd/config.py | 13 ++++++++++++- training/opsd/configs/opsd_hybrid_engine.json | 1 - training/opsd/configs/smoke_hybrid.json | 1 - training/opsd/configs/smoke_hybrid_gc.json | 1 - 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/training/opsd/config.py b/training/opsd/config.py index 66ff7e21e..1ccb9e9c2 100644 --- a/training/opsd/config.py +++ b/training/opsd/config.py @@ -13,7 +13,18 @@ from dataclasses import dataclass, field from typing import Optional -from deepspeed.runtime.rollout import RolloutConfig +from deepspeed.runtime.rollout import RolloutConfig as _BaseRolloutConfig + + +@dataclass +class RolloutConfig(_BaseRolloutConfig): + """Extends DeepSpeed's RolloutConfig with OPSD generation knobs.""" + max_prompt_length: int = 1024 + max_response_length: int = 1024 + temperature: float = 0.0 + top_p: float = 1.0 + top_k: int = -1 + n_samples_per_prompt: int = 1 @dataclass diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json index d2ebb8b03..f83503c27 100644 --- a/training/opsd/configs/opsd_hybrid_engine.json +++ b/training/opsd/configs/opsd_hybrid_engine.json @@ -18,7 +18,6 @@ "top_p": 1.0, "top_k": -1, "n_samples_per_prompt": 1, - "weight_sync_interval": 1 }, "distillation": { "loss_type": "reverse_kl", diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json index 774092926..51d80feea 100644 --- a/training/opsd/configs/smoke_hybrid.json +++ b/training/opsd/configs/smoke_hybrid.json @@ -18,7 +18,6 @@ "top_p": 1.0, "top_k": -1, "n_samples_per_prompt": 1, - "weight_sync_interval": 1 }, "distillation": { "loss_type": "reverse_kl", diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json index 
0512c1581..8c563b199 100644 --- a/training/opsd/configs/smoke_hybrid_gc.json +++ b/training/opsd/configs/smoke_hybrid_gc.json @@ -19,7 +19,6 @@ "top_k": -1, "n_samples_per_prompt": 1, "use_graph_capture": true, - "weight_sync_interval": 1 }, "distillation": { "loss_type": "reverse_kl", From d3eda203def138f8dfd786277e2641ea74482bf5 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 21:57:46 +0800 Subject: [PATCH 6/8] Keep only bench_decode_1p1r; add --graph-capture flag and engine wrapper fix - Remove 14B/multi-GPU benchmarks (bench_14b_rollout, bench_autotp_gc, bench_hybrid_tp, bench_hybrid_tp_opt) - Fix bench_decode_1p1r: wrap model for HybridEngineRollout - Add --graph-capture CLI flag Signed-off-by: Guokai Ma --- training/opsd/benchmarks/bench_14b_rollout.py | 134 ---------------- training/opsd/benchmarks/bench_autotp_gc.py | 96 ----------- training/opsd/benchmarks/bench_decode_1p1r.py | 8 +- training/opsd/benchmarks/bench_hybrid_tp.py | 145 ----------------- .../opsd/benchmarks/bench_hybrid_tp_opt.py | 149 ------------------ 5 files changed, 6 insertions(+), 526 deletions(-) delete mode 100644 training/opsd/benchmarks/bench_14b_rollout.py delete mode 100644 training/opsd/benchmarks/bench_autotp_gc.py delete mode 100644 training/opsd/benchmarks/bench_hybrid_tp.py delete mode 100644 training/opsd/benchmarks/bench_hybrid_tp_opt.py diff --git a/training/opsd/benchmarks/bench_14b_rollout.py b/training/opsd/benchmarks/bench_14b_rollout.py deleted file mode 100644 index d66c7615d..000000000 --- a/training/opsd/benchmarks/bench_14b_rollout.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC.""" -import time -import os -import sys -import torch -import deepspeed -from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig -from transformers import AutoModelForCausalLM, AutoTokenizer - -MODEL = "Qwen/Qwen2.5-14B-Instruct" -MAX_NEW_TOKENS = 256 -N_SAMPLES = 1 -CB_SIZE = 1 -N_RUNS 
= 5 -PROMPT = "def fibonacci(n):" - - -def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label): - rank = torch.distributed.get_rank() - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - device = torch.device(f"cuda:{local_rank}") - - rollout = HybridEngineRollout( - engine=engine, - tokenizer=tokenizer, - continuous_batching_size=cb_size, - use_graph_capture=use_graph_capture, - ) - - ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device) - req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids)) - sampling = SamplingConfig( - max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95, - n_samples_per_prompt=N_SAMPLES - ) - - # Warmup - torch.manual_seed(42) - engine.eval() - rollout.generate(req, sampling) - engine.train() - - # Benchmark - times = [] - total_toks = 0 - for i in range(N_RUNS): - torch.manual_seed(42 + i) - engine.eval() - torch.cuda.synchronize() - t0 = time.time() - batch = rollout.generate(req, sampling) - torch.cuda.synchronize() - times.append(time.time() - t0) - engine.train() - - # Count tokens from last run - pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id - for i in range(batch.input_ids.shape[0]): - resp = batch.input_ids[i, batch.response_start_idx[i]:] - total_toks += (resp != pad_id).sum().item() - - t_avg = sum(times[1:]) / len(times[1:]) - - if rank == 0: - print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s " - f"runs={[f'{t*1000:.0f}' for t in times]}") - - return total_toks, t_avg - - -def main(): - deepspeed.init_distributed() - rank = torch.distributed.get_rank() - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - torch.cuda.set_device(local_rank) - - world_size = torch.distributed.get_world_size() - tp_size = world_size # all GPUs used for TP - - tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True) - - ds_config 
= { - "bf16": {"enabled": True}, - "zero_optimization": {"stage": 0}, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": world_size, - "gradient_accumulation_steps": 1, - "hybrid_engine": { - "enabled": True, - "max_out_tokens": 512, - "inference_tp_size": 1, - "release_inference_cache": False, - "pin_parameters": True, - "tp_gather_partition_size": 8, - }, - } - - if tp_size > 1: - ds_config["tensor_parallel"] = { - "autotp_size": tp_size, - "preset_model": "qwen2", - "tp": {"tp_size": tp_size}, - } - - engine, *_ = deepspeed.initialize(model=model, config=ds_config) - - if rank == 0: - print(f"\n{'='*60}") - print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}") - print(f"{'='*60}") - - # 1P1R without graph capture (CB=1, no GC) - try: - bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}") - except Exception as e: - if rank == 0: - print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}") - import traceback; traceback.print_exc() - - # 1P1R with CUDA graph capture - try: - bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}+GC") - except Exception as e: - if rank == 0: - print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}") - import traceback; traceback.print_exc() - - if rank == 0: - print(f"{'='*60}\n") - - -if __name__ == "__main__": - main() diff --git a/training/opsd/benchmarks/bench_autotp_gc.py b/training/opsd/benchmarks/bench_autotp_gc.py deleted file mode 100644 index c9a245b24..000000000 --- a/training/opsd/benchmarks/bench_autotp_gc.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Benchmark rollout with AutoTP + graph capture on 14B model.""" -import time -import torch -import deepspeed -from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig -from transformers import AutoModelForCausalLM, AutoTokenizer - -def main(): - deepspeed.init_distributed() - rank = 
torch.distributed.get_rank() - local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count() - torch.cuda.set_device(local_rank) - device = torch.device(f"cuda:{local_rank}") - - model_name = "Qwen/Qwen2.5-14B-Instruct" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - model = AutoModelForCausalLM.from_pretrained( - model_name, dtype=torch.bfloat16, trust_remote_code=True - ) - - ds_config = { - "bf16": {"enabled": True}, - "zero_optimization": {"stage": 0}, - "tensor_parallel": { - "autotp_size": 2, - "preset_model": "qwen2", - "tp": {"tp_size": 2}, - }, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 2, - "gradient_accumulation_steps": 1, - "hybrid_engine": { - "enabled": True, - "max_out_tokens": 512, - "inference_tp_size": 1, - "release_inference_cache": False, - "pin_parameters": True, - "tp_gather_partition_size": 8, - }, - } - - engine, *_ = deepspeed.initialize(model=model, config=ds_config) - - rollout = HybridEngineRollout( - engine=engine, - tokenizer=tokenizer, - continuous_batching_size=2, - use_graph_capture=True, - ) - - # Prepare prompt - prompt = "def fibonacci(n):" - ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) - req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids)) - sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4) - - # Warmup - torch.manual_seed(42) - engine.eval() - rollout.generate(req, sampling) - engine.train() - - # Benchmark - times = [] - for i in range(5): - torch.manual_seed(42) - engine.eval() - torch.cuda.synchronize() - t0 = time.time() - batch = rollout.generate(req, sampling) - torch.cuda.synchronize() - times.append(time.time() - t0) - engine.train() - - t_avg = sum(times[1:]) / len(times[1:]) - # Count tokens - pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id - total_toks = 0 - for i in range(batch.input_ids.shape[0]): - resp = batch.input_ids[i, 
batch.response_start_idx[i]:] - total_toks += (resp != pad_id).sum().item() - - if rank == 0: - print(f"\n{'='*60}") - print(f"Model: {model_name}") - print(f"TP=2, n=8, cb=4, graph_capture=True, max_new_tokens=256") - print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms") - print(f"Total response tokens: {total_toks}") - print(f"Throughput: {total_toks/t_avg:.1f} tok/s") - print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}") - print(f"{'='*60}\n") - - -if __name__ == "__main__": - main() diff --git a/training/opsd/benchmarks/bench_decode_1p1r.py b/training/opsd/benchmarks/bench_decode_1p1r.py index 58fb667d4..645a5fbd1 100644 --- a/training/opsd/benchmarks/bench_decode_1p1r.py +++ b/training/opsd/benchmarks/bench_decode_1p1r.py @@ -140,6 +140,7 @@ def main(): parser.add_argument("--max-new-tokens", type=int, default=64) parser.add_argument("--num-warmup", type=int, default=3) parser.add_argument("--num-iters", type=int, default=10) + parser.add_argument("--graph-capture", action="store_true", help="Enable CUDA graph capture") args = parser.parse_args() device = get_accelerator().current_device() #ignore-cuda @@ -161,8 +162,11 @@ def main(): print(f" Overhead/step: {raw['overhead_ms_per_step']:.3f} ms (total: {raw['overhead_total_ms']:.1f} ms)") print(f" Total: {raw['total_ms']:.1f} ms") - print(f"\n=== HybridEngineRollout benchmark ===") - rollout = HybridEngineRollout(model, tokenizer) + print(f"\n=== HybridEngineRollout benchmark (graph_capture={args.graph_capture}) ===") + engine = type('Engine', (), {'module': model})() # lightweight wrapper + from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig + cfg = HybridEngineRolloutConfig(use_graph_capture=args.graph_capture) + rollout = HybridEngineRollout(engine, tokenizer, cfg=cfg) rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup, args.num_iters) print(f" Rollout generate: {rr['rollout_total_ms']:.1f} ms") diff --git 
a/training/opsd/benchmarks/bench_hybrid_tp.py b/training/opsd/benchmarks/bench_hybrid_tp.py deleted file mode 100644 index 3f41150c7..000000000 --- a/training/opsd/benchmarks/bench_hybrid_tp.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team -"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2). - -Usage: - deepspeed --num_gpus 2 bench_hybrid_tp.py \ - --model Qwen/Qwen2.5-14B-Instruct \ - --max-new-tokens 64 -""" - -import argparse -import os -import time - -import deepspeed -import numpy as np -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout -from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig - - -def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters): - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - device = torch.device(f"cuda:{local_rank}") - - torch.manual_seed(42) - input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) - attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) - sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0) - request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask) - - times = [] - for i in range(num_warmup + num_iters): - get_accelerator().synchronize(device=device) #ignore-cuda - t0 = time.perf_counter() - with torch.no_grad(): - result = rollout.generate(request, sampling) - get_accelerator().synchronize(device=device) #ignore-cuda - elapsed = time.perf_counter() - t0 - times.append(elapsed) - if local_rank == 0: - label = "warmup" if i < num_warmup else "iter" - n_tokens = result.input_ids.shape[-1] - prompt_len - print(f" [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}") - - avg = np.mean(times[-num_iters:]) * 1000 - return 
{"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens} - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct") - parser.add_argument("--prompt-len", type=int, default=64) - parser.add_argument("--max-new-tokens", type=int, default=64) - parser.add_argument("--num-warmup", type=int, default=3) - parser.add_argument("--num-iters", type=int, default=10) - parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0))) - args = parser.parse_args() - - local_rank = args.local_rank - world_size = int(os.environ.get("WORLD_SIZE", "1")) - - deepspeed.init_distributed() - - if local_rank == 0: - print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===") - print(f" Model: {args.model}") - print(f" TP size: {world_size}") - print(f" Prompt len: {args.prompt_len}") - print(f" Decode len: {args.max_new_tokens}") - print(f" Warmup: {args.num_warmup}") - print(f" Iters: {args.num_iters}") - print() - - tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left") - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - model = AutoModelForCausalLM.from_pretrained( - args.model, - torch_dtype=torch.bfloat16, - ) - - ds_config = { - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": 0 - }, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": world_size, - "gradient_accumulation_steps": 1, - "tensor_parallel": { - "autotp_size": world_size, - "preset_model": "qwen2", - }, - } - - engine, *_ = deepspeed.initialize( - model=model, - optimizer=None, - model_parameters=model.parameters(), - config=ds_config, - ) - - if local_rank == 0: - print(" DeepSpeed engine initialized.") - param_count = sum(p.numel() for p in engine.parameters()) / 1e9 - alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda - print(f" Parameters (local): {param_count:.2f}B") - print(f" GPU mem 
allocated: {alloc:.1f} GB") - print() - - rollout = HybridEngineRollout(engine, tokenizer) - - if local_rank == 0: - print(" Running benchmark...") - - result = bench_hybrid_rollout( - rollout, - tokenizer, - args.prompt_len, - args.max_new_tokens, - args.num_warmup, - args.num_iters, - ) - - if local_rank == 0: - total = result["rollout_total_ms"] - per_step = total / args.max_new_tokens - throughput = 1000.0 / per_step - print() - print(f"=== Results ===") - print(f" Total generate: {total:.1f} ms") - print(f" Per decode step: {per_step:.2f} ms") - print(f" Throughput: {throughput:.1f} tokens/s") - - -if __name__ == "__main__": - main() diff --git a/training/opsd/benchmarks/bench_hybrid_tp_opt.py b/training/opsd/benchmarks/bench_hybrid_tp_opt.py deleted file mode 100644 index d7fae2dde..000000000 --- a/training/opsd/benchmarks/bench_hybrid_tp_opt.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team -"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer. 
- -Usage: - deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \ - --model Qwen/Qwen2.5-14B-Instruct \ - --max-new-tokens 64 -""" - -import argparse -import os -import time - -import deepspeed -import numpy as np -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout -from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig - - -def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters): - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - device = torch.device(f"cuda:{local_rank}") - - torch.manual_seed(42) - input_ids = torch.randint(10, 1000, (1, prompt_len), device=device) - attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device) - sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0) - request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask) - - times = [] - for i in range(num_warmup + num_iters): - get_accelerator().synchronize(device=device) #ignore-cuda - t0 = time.perf_counter() - with torch.no_grad(): - result = rollout.generate(request, sampling) - get_accelerator().synchronize(device=device) #ignore-cuda - elapsed = time.perf_counter() - t0 - times.append(elapsed) - if local_rank == 0: - label = "warmup" if i < num_warmup else "iter" - n_tokens = result.input_ids.shape[-1] - prompt_len - print(f" [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}") - - avg = np.mean(times[-num_iters:]) * 1000 - return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens} - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct") - parser.add_argument("--prompt-len", type=int, default=64) - parser.add_argument("--max-new-tokens", type=int, default=64) - parser.add_argument("--num-warmup", type=int, 
default=3) - parser.add_argument("--num-iters", type=int, default=10) - parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0))) - args = parser.parse_args() - - local_rank = args.local_rank - world_size = int(os.environ.get("WORLD_SIZE", "1")) - - deepspeed.init_distributed() - - if local_rank == 0: - print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===") - print(f" Model: {args.model}") - print(f" TP size: {world_size}") - print(f" Prompt len: {args.prompt_len}") - print(f" Decode len: {args.max_new_tokens}") - print() - - tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left") - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - model = AutoModelForCausalLM.from_pretrained( - args.model, - torch_dtype=torch.bfloat16, - ) - - ds_config = { - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": 0 - }, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": world_size, - "gradient_accumulation_steps": 1, - "tensor_parallel": { - "autotp_size": world_size, - "preset_model": "qwen2", - }, - } - - engine, _, _, _ = deepspeed.initialize( - model=model, - model_parameters=model.parameters(), - config=ds_config, - ) - - if local_rank == 0: - print(" DeepSpeed engine initialized (with optimizer).") - param_count = sum(p.numel() for p in engine.parameters()) / 1e9 - alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda - reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda - print(f" Parameters (local): {param_count:.2f}B") - alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda - reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda - print(f" GPU mem allocated: {alloc:.1f} GB") - print(f" GPU mem reserved: {reserv:.1f} GB") - print() - - rollout = HybridEngineRollout(engine, tokenizer) - - if local_rank == 0: - print(" Running benchmark...") - - result = 
bench_hybrid_rollout( - rollout, - tokenizer, - args.prompt_len, - args.max_new_tokens, - args.num_warmup, - args.num_iters, - ) - - if local_rank == 0: - total = result["rollout_total_ms"] - per_step = total / args.max_new_tokens - throughput = 1000.0 / per_step - print() - print(f"=== Results ===") - print(f" Total generate: {total:.1f} ms") - print(f" Per decode step: {per_step:.2f} ms") - print(f" Throughput: {throughput:.1f} tokens/s") - alloc = get_accelerator().memory_allocated(local_rank) / 1e9 #ignore-cuda - reserv = get_accelerator().memory_reserved(local_rank) / 1e9 #ignore-cuda - print(f" GPU mem (final): alloc={alloc:.1f} GB, reserved={reserv:.1f} GB") - - -if __name__ == "__main__": - main() From 8bf134e1d841ac4b931e77ca034b2e7750abd534 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 23:06:41 +0800 Subject: [PATCH 7/8] Fix distillation temperature from 0 to 1.0 in smoke and production configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit temperature=0 causes logits/0 = inf → NaN loss. The correct default for knowledge distillation is temperature=1.0 (standard softmax). 
Signed-off-by: Guokai Ma --- training/opsd/configs/opsd_hybrid_engine.json | 2 +- training/opsd/configs/smoke_hybrid.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json index f83503c27..feeef50c9 100644 --- a/training/opsd/configs/opsd_hybrid_engine.json +++ b/training/opsd/configs/opsd_hybrid_engine.json @@ -21,7 +21,7 @@ }, "distillation": { "loss_type": "reverse_kl", - "temperature": 0, + "temperature": 1.0, "chunk_size": 512 }, "training": { diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json index 51d80feea..c936694b8 100644 --- a/training/opsd/configs/smoke_hybrid.json +++ b/training/opsd/configs/smoke_hybrid.json @@ -21,7 +21,7 @@ }, "distillation": { "loss_type": "reverse_kl", - "temperature": 0, + "temperature": 1.0, "chunk_size": 128 }, "training": { From 5d1d2e54b0dc53d4df9fe5709bf64a865d3a1a78 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 3 Jul 2026 23:18:24 +0800 Subject: [PATCH 8/8] Fix trailing commas in JSON configs; remove unused smoke_ds_zero3.json Signed-off-by: Guokai Ma --- training/opsd/configs/opsd_hybrid_engine.json | 8 ++--- training/opsd/configs/smoke_ds_zero3.json | 35 ------------------- training/opsd/configs/smoke_hybrid.json | 8 ++--- training/opsd/configs/smoke_hybrid_gc.json | 8 ++--- 4 files changed, 12 insertions(+), 47 deletions(-) delete mode 100644 training/opsd/configs/smoke_ds_zero3.json diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json index feeef50c9..3478a1fa1 100644 --- a/training/opsd/configs/opsd_hybrid_engine.json +++ b/training/opsd/configs/opsd_hybrid_engine.json @@ -2,7 +2,7 @@ "student": { "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", "dtype": "bfloat16", - "trust_remote_code": false, + "trust_remote_code": false }, "teacher": { "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct", @@ -17,7 
+17,7 @@ "temperature": 0, "top_p": 1.0, "top_k": -1, - "n_samples_per_prompt": 1, + "n_samples_per_prompt": 1 }, "distillation": { "loss_type": "reverse_kl", @@ -28,7 +28,7 @@ "train_batch_size": 1, "micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, - "learning_rate": 1e-6, + "learning_rate": 1e-06, "weight_decay": 0.0, "num_train_epochs": 1, "max_steps": -1, @@ -44,4 +44,4 @@ "shuffle": true }, "deepspeed_config": "configs/ds_zero3.json" -} +} \ No newline at end of file diff --git a/training/opsd/configs/smoke_ds_zero3.json b/training/opsd/configs/smoke_ds_zero3.json deleted file mode 100644 index 74211f3fb..000000000 --- a/training/opsd/configs/smoke_ds_zero3.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "bf16": { - "enabled": true - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": true, - "contiguous_gradients": true, - "reduce_bucket_size": 5e7, - "stage3_prefetch_bucket_size": 5e7, - "stage3_param_persistence_threshold": 1e6, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_16bit_weights_on_model_save": true - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 1e-6, - "betas": [0.9, 0.95], - "eps": 1e-8, - "weight_decay": 0.0 - } - }, - "gradient_clipping": 1.0, - "hybrid_engine": { - "enabled": true, - "max_out_tokens": 512, - "inference_tp_size": 1, - "release_inference_cache": false, - "pin_parameters": true, - "tp_gather_partition_size": 8 - }, - "wall_clock_breakdown": false -} diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json index c936694b8..250214ddc 100644 --- a/training/opsd/configs/smoke_hybrid.json +++ b/training/opsd/configs/smoke_hybrid.json @@ -2,7 +2,7 @@ "student": { "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", "dtype": "bfloat16", - "trust_remote_code": false, + "trust_remote_code": false }, "teacher": { "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", @@ -17,7 +17,7 @@ "temperature": 0, "top_p": 1.0, "top_k": -1, - 
"n_samples_per_prompt": 1, + "n_samples_per_prompt": 1 }, "distillation": { "loss_type": "reverse_kl", @@ -28,7 +28,7 @@ "train_batch_size": 1, "micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, - "learning_rate": 1e-6, + "learning_rate": 1e-06, "weight_decay": 0.0, "num_train_epochs": 1, "max_steps": 5, @@ -44,4 +44,4 @@ "shuffle": true }, "deepspeed_config": "configs/smoke_ds_zero0.json" -} +} \ No newline at end of file diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json index 8c563b199..e32d070a9 100644 --- a/training/opsd/configs/smoke_hybrid_gc.json +++ b/training/opsd/configs/smoke_hybrid_gc.json @@ -2,7 +2,7 @@ "student": { "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", "dtype": "bfloat16", - "trust_remote_code": false, + "trust_remote_code": false }, "teacher": { "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct", @@ -18,7 +18,7 @@ "top_p": 1.0, "top_k": -1, "n_samples_per_prompt": 1, - "use_graph_capture": true, + "use_graph_capture": true }, "distillation": { "loss_type": "reverse_kl", @@ -29,7 +29,7 @@ "train_batch_size": 1, "micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, - "learning_rate": 1e-6, + "learning_rate": 1e-06, "weight_decay": 0.0, "num_train_epochs": 1, "max_steps": 5, @@ -45,4 +45,4 @@ "shuffle": true }, "deepspeed_config": "configs/smoke_ds_zero0.json" -} +} \ No newline at end of file