From 14651ed645735860828f23af6f06593c1a37f58c Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Wed, 24 Jun 2026 15:51:48 +0800
Subject: [PATCH 1/8] Add OPSD (On-Policy Distillation) training example

Entry point, configs, data, and tests for on-policy distillation
using DeepSpeed's hybrid engine rollout and vLLM backend.

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/README.md                       | 222 ++++++++++++++++
 training/opsd/configs/ds_zero3.json           |  43 ++++
 training/opsd/configs/opsd_hybrid_engine.json |  48 ++++
 training/opsd/configs/opsd_vllm_disjoint.json |  54 ++++
 training/opsd/configs/smoke_ds_zero0.json     |  20 ++
 training/opsd/configs/smoke_ds_zero3.json     |  35 +++
 training/opsd/configs/smoke_hybrid.json       |  48 ++++
 training/opsd/configs/smoke_hybrid_gc.json    |  49 ++++
 training/opsd/configs/smoke_vllm.json         |  57 +++++
 training/opsd/data/prompts.jsonl              | 238 ++++++++++++++++++
 training/opsd/main.py                         | 134 ++++++++++
 training/opsd/requirements.txt                |   5 +
 training/opsd/scripts/train_opsd_hybrid.sh    |  14 ++
 training/opsd/scripts/train_opsd_vllm.sh      |  24 ++
 training/opsd/tests/test_losses.py            | 166 ++++++++++++
 training/opsd/tests/test_teacher_caching.py   | 101 ++++++++
 16 files changed, 1258 insertions(+)
 create mode 100644 training/opsd/README.md
 create mode 100644 training/opsd/configs/ds_zero3.json
 create mode 100644 training/opsd/configs/opsd_hybrid_engine.json
 create mode 100644 training/opsd/configs/opsd_vllm_disjoint.json
 create mode 100644 training/opsd/configs/smoke_ds_zero0.json
 create mode 100644 training/opsd/configs/smoke_ds_zero3.json
 create mode 100644 training/opsd/configs/smoke_hybrid.json
 create mode 100644 training/opsd/configs/smoke_hybrid_gc.json
 create mode 100644 training/opsd/configs/smoke_vllm.json
 create mode 100644 training/opsd/data/prompts.jsonl
 create mode 100644 training/opsd/main.py
 create mode 100644 training/opsd/requirements.txt
 create mode 100644 training/opsd/scripts/train_opsd_hybrid.sh
 create mode 100644 training/opsd/scripts/train_opsd_vllm.sh
 create mode 100644 training/opsd/tests/test_losses.py
 create mode 100644 training/opsd/tests/test_teacher_caching.py

diff --git a/training/opsd/README.md b/training/opsd/README.md
new file mode 100644
index 000000000..3fce93c36
--- /dev/null
+++ b/training/opsd/README.md
@@ -0,0 +1,222 @@
+# On-Policy Distillation (OPSD) on DeepSpeed
+
+A DeepSpeed-native port of [HJSang/OPSD_OnPolicyDistillation](https://github.com/HJSang/OPSD_OnPolicyDistillation),
+removing the verl dependency and building directly on DeepSpeed primitives
+(ZeRO-3, hybrid engine, `deepspeed.initialize`).
+
+On-policy distillation trains a small **student** model to imitate a large
+frozen **teacher** on the student's *own* generated rollouts. Each training
+step has three phases:
+
+```
+┌────────────┐   prompts   ┌──────────────────┐   prompt+response   ┌────────────┐
+│ Dataloader │ ──────────▶ │ Student rollout  │ ──────────────────▶ │  Teacher   │
+└────────────┘             │ (hybrid / vLLM)  │                     │  forward   │
+                           └──────────────────┘                     └─────┬──────┘
+                                                                          │ logits → CPU cache
+                                                                          ▼
+                                                              ┌─────────────────────┐
+                                                              │ Student forward +   │
+                                                              │ streamed KL / JSD + │
+                                                              │ backward / step     │
+                                                              └─────────────────────┘
+```
+
+Loss = per-token divergence (`forward_kl` | `reverse_kl` | `jsd`) between
+student and teacher distributions on the student's generated tokens, chunked
+over the sequence axis so the full `[B, T, V]` teacher tensor never
+co-resides with the student logits on the training device.
+
+## Layout
+
+```
+training/opsd/
+├── main.py                            # entry point (deepspeed launcher)
+├── opsd/
+│   ├── config.py                      # OPSDConfig dataclass + JSON loader
+│   ├── losses.py                      # chunked / streamed KL & JSD
+│   ├── teacher.py                     # frozen teacher + CPU logit cache
+│   ├── trainer.py                     # three-phase training loop
+│   ├── data.py                        # JSONL prompt dataset + left-pad collator
+│   ├── utils.py                       # response-mask + shift helpers
+│   └── rollout/
+│       ├── base.py                    # RolloutEngine ABC, request/batch dataclasses
+│       ├── hybrid_engine.py           # DeepSpeed hybrid-engine rollout
+│       └── vllm.py                    # vLLM rollout on disjoint GPUs
+├── configs/
+│   ├── ds_zero3.json                  # base DeepSpeed ZeRO-3 + hybrid engine
+│   ├── opsd_hybrid_engine.json        # production-ish hybrid-engine OPSD config
+│   ├── opsd_vllm_disjoint.json        # vLLM rollout on a disjoint GPU group
+│   ├── smoke_hybrid.json              # 5-step smoke test with Qwen2.5-0.5B / 1.5B
+│   ├── smoke_vllm.json                # same but with vLLM rollout
+│   └── smoke_ds_zero3.json            # ZeRO-3 config tuned for smoke runs
+├── scripts/
+│   ├── train_opsd_hybrid.sh           # launch hybrid-engine training
+│   └── train_opsd_vllm.sh             # launch vLLM training
+└── tests/                             # CPU-only unit tests (run with pytest)
+```
+
+## Quick start
+
+### Install
+
+```
+pip install deepspeed transformers datasets accelerate
+# Optional, only for the vLLM rollout backend:
+pip install 'vllm>=0.6.4'
+```
+
+### Hybrid-engine training (single-node, no vLLM)
+
+```
+cd training/opsd
+NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json
+```
+
+The hybrid engine path lives entirely within DeepSpeed: the student engine
+both trains and generates, sharing weights without a copy step. Easiest to
+get running; slower generation than vLLM.
+
+### vLLM training (disjoint GPU group)
+
+```
+cd training/opsd
+# Train on GPUs 0..5, run vLLM on 6,7 (matches default config)
+NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \
+    bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json
+```
+
+vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0
+constructs the `LLM` handle; other training ranks receive generated token
+ids via NCCL broadcast.
+
+### Smoke tests (5 steps, small models)
+
+The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B
+(student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated
+end-to-end before scaling up.
+
+```
+cd training/opsd
+deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json
+# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM):
+NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --include localhost:0,1 \
+    main.py --config configs/smoke_vllm.json
+```
+
+## Unit tests
+
+The CPU-runnable test suite exercises the loss math, teacher caching, rollout
+contract, and vLLM stitch logic. Run with:
+
+```
+cd training/opsd
+python -m pytest tests/ -v
+```
+
+## Configuration
+
+`OPSDConfig` is a plain dataclass loaded from JSON (no Hydra). The schema:
+
+```json
+{
+  "student":    { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" },
+  "teacher":    { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true },
+  "rollout":    { "engine": "hybrid_engine | vllm", ... },
+  "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 },
+  "training":   { "train_batch_size": 8, "learning_rate": 1e-6, ... },
+  "data":       { "path": "data/prompts.jsonl", "prompt_field": "prompt" },
+  "deepspeed_config": "configs/ds_zero3.json"
+}
+```
+
+See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json`
+for fully-populated examples.
+
+## Adding a new model architecture
+
+No special steps are needed for new model architectures. vLLM's RLHF weight
+transfer API handles TP slicing internally; the caller only needs to send full
+tensors.
+
+## Design notes
+
+* **Why CPU-cache the teacher logits?** Holding both student and teacher
+  `[B, T, V]` tensors on GPU at once doubles memory pressure. Staging the
+  teacher to host between the teacher forward and the student backward halves
+  the worst-case GPU footprint of the loss path. The streamed loss
+  (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU
+  one sequence slice at a time so the full tensor never re-materialises.
+
+* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends
+  have very different lifecycles (hybrid engine reads student weights live;
+  vLLM holds its own copy and must be synced) but the trainer should not
+  care. The ABC keeps the trainer engine-agnostic so additional backends
+  (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching
+  the loop.
+
+* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is
+  simpler to debug — failures in vLLM don't take down training and vice
+  versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as
+  a follow-up.
+
+* **Weight sync uses vLLM's RLHF API.** vLLM 0.22.0+ exposes
+  ``/update_weights`` which handles TP slicing internally. The trainer
+  sends full tensors and vLLM distributes them.
+
+## vLLM status
+
+The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but
+not yet usable under the DeepSpeed launcher**. During live validation on
+4× H200 we hit a blocking issue:
+
+> vLLM's worker init calls `new_group(...)` on the global process group as
+> a collective. Under `deepspeed --num_gpus N`, the world is all `N`
+> training ranks but only rank 0 calls into vLLM, so the constructor hangs
+> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 +
+> torch 2.5.1. Standalone vLLM (world size 1) works in seconds.
+
+The fix requires running vLLM in a **separate top-level Python process**
+with its own world, accessed over HTTP/RPC from the trainer — the pattern
+used by TRL and OpenRLHF. That's a larger refactor than fits in this PR;
+the current `VLLMRollout` will be the basis for it once landed.
+
+What's verified for the vLLM path today:
+* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test)
+* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated)
+
+What's **not** verified:
+* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig`
+* `LLM.collective_rpc("load_weights", ...)` weight sync at training time
+
+The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated
+end-to-end on the same hardware.
+
+## Other known limitations (v1)
+
+* **vLLM weight sync (when it works) goes through pickle** —
+  `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`.
+  Expect several seconds per sync on a 7B model. A faster v2 would broadcast
+  tensors via NCCL on a shared trainer↔vLLM process group — see verl's
+  `bucketed_weight_transfer.py` for a reference design.
+* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's
+  slicing math is unit-tested but no live run exists.
+* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not
+  ported. Easy to add: scale `per_tok` by a reward weight in the loss path.
+* **GRPO and other on-policy RL recipes** are out of scope. The
+  `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO
+  trainer would add its own advantage / KL-to-reference logic on top.
+* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed.
+* **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no
+  hybrid-engine inference acceleration), since DeepSpeed's inference policy
+  list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15.
+  The fallback gathers params via `GatheredParameters` and calls the HF
+  model's `generate` directly — correct, just ~3-5x slower than the
+  accelerated path.
+
+## References
+
+* OPSD reference repo: <https://github.com/HJSang/OPSD_OnPolicyDistillation>
+* DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py`
+* verl rollout / weight-sync design (used as a cross-check):
+  <https://github.com/volcengine/verl/tree/main/verl/workers/rollout/vllm_rollout>
diff --git a/training/opsd/configs/ds_zero3.json b/training/opsd/configs/ds_zero3.json
new file mode 100644
index 000000000..1f43339a6
--- /dev/null
+++ b/training/opsd/configs/ds_zero3.json
@@ -0,0 +1,43 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "reduce_bucket_size": 5e7,
+        "stage3_prefetch_bucket_size": 5e7,
+        "stage3_param_persistence_threshold": 1e6,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-6,
+            "betas": [0.9, 0.95],
+            "eps": 1e-8,
+            "weight_decay": 0.0
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 1e-6,
+            "warmup_num_steps": 0
+        }
+    },
+    "gradient_clipping": 1.0,
+    "hybrid_engine": {
+        "enabled": true,
+        "max_out_tokens": 2048,
+        "inference_tp_size": 1,
+        "release_inference_cache": false,
+        "pin_parameters": true,
+        "tp_gather_partition_size": 8
+    },
+    "wall_clock_breakdown": false
+}
diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json
new file mode 100644
index 000000000..d2ebb8b03
--- /dev/null
+++ b/training/opsd/configs/opsd_hybrid_engine.json
@@ -0,0 +1,48 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": true
+    },
+    "rollout": {
+        "engine": "hybrid_engine",
+        "max_prompt_length": 1024,
+        "max_response_length": 1024,
+        "temperature": 0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "weight_sync_interval": 1
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 512
+    },
+    "training": {
+        "train_batch_size": 1,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": -1,
+        "warmup_steps": 0,
+        "save_steps": 500,
+        "logging_steps": 10,
+        "save_dir": "./opsd_ckpt_hybrid",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/ds_zero3.json"
+}
diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json
new file mode 100644
index 000000000..c98489df6
--- /dev/null
+++ b/training/opsd/configs/opsd_vllm_disjoint.json
@@ -0,0 +1,54 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": true
+    },
+    "rollout": {
+        "engine": "vllm",
+        "max_prompt_length": 1024,
+        "max_response_length": 1024,
+        "temperature": 0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "gpus": [6, 7],
+        "tensor_parallel_size": 2,
+        "gpu_memory_utilization": 0.85,
+        "vllm_dtype": "bfloat16",
+        "weight_sync_interval": 4,
+        "vllm_min_version": "0.6.4",
+        "vllm_port": 8000
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 512
+    },
+    "training": {
+        "train_batch_size": 1,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": -1,
+        "warmup_steps": 0,
+        "save_steps": 500,
+        "logging_steps": 10,
+        "save_dir": "./opsd_ckpt_vllm",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/ds_zero3.json"
+}
diff --git a/training/opsd/configs/smoke_ds_zero0.json b/training/opsd/configs/smoke_ds_zero0.json
new file mode 100644
index 000000000..26d9e8495
--- /dev/null
+++ b/training/opsd/configs/smoke_ds_zero0.json
@@ -0,0 +1,20 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 0
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-6,
+            "betas": [0.9, 0.95],
+            "eps": 1e-8,
+            "weight_decay": 0.0,
+            "torch_adam": true
+        }
+    },
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false
+}
diff --git a/training/opsd/configs/smoke_ds_zero3.json b/training/opsd/configs/smoke_ds_zero3.json
new file mode 100644
index 000000000..74211f3fb
--- /dev/null
+++ b/training/opsd/configs/smoke_ds_zero3.json
@@ -0,0 +1,35 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "reduce_bucket_size": 5e7,
+        "stage3_prefetch_bucket_size": 5e7,
+        "stage3_param_persistence_threshold": 1e6,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-6,
+            "betas": [0.9, 0.95],
+            "eps": 1e-8,
+            "weight_decay": 0.0
+        }
+    },
+    "gradient_clipping": 1.0,
+    "hybrid_engine": {
+        "enabled": true,
+        "max_out_tokens": 512,
+        "inference_tp_size": 1,
+        "release_inference_cache": false,
+        "pin_parameters": true,
+        "tp_gather_partition_size": 8
+    },
+    "wall_clock_breakdown": false
+}
diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json
new file mode 100644
index 000000000..774092926
--- /dev/null
+++ b/training/opsd/configs/smoke_hybrid.json
@@ -0,0 +1,48 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": false
+    },
+    "rollout": {
+        "engine": "hybrid_engine",
+        "max_prompt_length": 128,
+        "max_response_length": 64,
+        "temperature": 0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "weight_sync_interval": 1
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 128
+    },
+    "training": {
+        "train_batch_size": 1,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": 5,
+        "warmup_steps": 0,
+        "save_steps": 10000,
+        "logging_steps": 1,
+        "save_dir": "./opsd_smoke_hybrid_ckpt",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/smoke_ds_zero0.json"
+}
diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json
new file mode 100644
index 000000000..0512c1581
--- /dev/null
+++ b/training/opsd/configs/smoke_hybrid_gc.json
@@ -0,0 +1,49 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": false
+    },
+    "rollout": {
+        "engine": "hybrid_engine",
+        "max_prompt_length": 128,
+        "max_response_length": 64,
+        "temperature": 0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "use_graph_capture": true,
+        "weight_sync_interval": 1
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 128
+    },
+    "training": {
+        "train_batch_size": 1,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": 5,
+        "warmup_steps": 0,
+        "save_steps": 10000,
+        "logging_steps": 1,
+        "save_dir": "./opsd_smoke_gc_ckpt",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/smoke_ds_zero0.json"
+}
diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json
new file mode 100644
index 000000000..fe375e602
--- /dev/null
+++ b/training/opsd/configs/smoke_vllm.json
@@ -0,0 +1,57 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": false
+    },
+    "rollout": {
+        "engine": "vllm",
+        "max_prompt_length": 128,
+        "max_response_length": 64,
+        "temperature": 0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "gpus": [],
+        "tensor_parallel_size": 1,
+        "gpu_memory_utilization": 0.3,
+        "vllm_dtype": "bfloat16",
+        "weight_sync_interval": 2,
+        "vllm_min_version": "0.6.4",
+        "vllm_enforce_eager": true,
+        "vllm_port": 8000,
+        "vllm_python": "/root/miniconda3/envs/vllm/bin/python",
+        "weight_transfer_backend": "gdr"
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 128
+    },
+    "training": {
+        "train_batch_size": 1,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": 5,
+        "warmup_steps": 0,
+        "save_steps": 10000,
+        "logging_steps": 1,
+        "save_dir": "./opsd_smoke_vllm_ckpt",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/smoke_ds_zero0.json"
+}
diff --git a/training/opsd/data/prompts.jsonl b/training/opsd/data/prompts.jsonl
new file mode 100644
index 000000000..bf0dba878
--- /dev/null
+++ b/training/opsd/data/prompts.jsonl
@@ -0,0 +1,238 @@
+{"prompt": "Solve: 17 + 25 = ?"}
+{"prompt": "What is 12 multiplied by 8?"}
+{"prompt": "If a train travels 60 miles per hour for 3 hours, how far does it go?"}
+{"prompt": "What is the square root of 144?"}
+{"prompt": "Compute 15% of 240."}
+{"prompt": "A rectangle has length 7 and width 4. What is its area?"}
+{"prompt": "Solve for x: 2x + 5 = 17."}
+{"prompt": "What is 7 factorial?"}
+{"prompt": "Compute the sum of integers from 1 to 10."}
+{"prompt": "What is 2 to the power of 10?"}
+{"prompt": "Find the perimeter of a square with side length 9."}
+{"prompt": "If 5 apples cost 2.50, what is the cost of 12 apples?"}
+{"prompt": "What is the greatest common divisor of 24 and 36?"}
+{"prompt": "Convert 0.75 to a fraction in simplest form."}
+{"prompt": "If x + y = 10 and x - y = 4, find x and y."}
+{"prompt": "What is 1/4 + 1/3?"}
+{"prompt": "A circle has radius 5. What is its area?"}
+{"prompt": "Compute (3 + 4) * (5 - 2)."}
+{"prompt": "What is 81 divided by 9?"}
+{"prompt": "If a number doubled is 18, what is the number?"}
+{"prompt": "What is 3/5 expressed as a percentage?"}
+{"prompt": "Calculate the area of a triangle with base 10 and height 6."}
+{"prompt": "What is the least common multiple of 4 and 6?"}
+{"prompt": "If a shirt costs 25 after a 20% discount, what was the original price?"}
+{"prompt": "Simplify: 2(3x + 4) - x."}
+{"prompt": "What is the value of pi rounded to 4 decimal places?"}
+{"prompt": "How many sides does a hexagon have?"}
+{"prompt": "Compute 2^3 + 3^2."}
+{"prompt": "If you roll a standard die, what is the probability of getting a 4?"}
+{"prompt": "What is the average of 12, 15, and 18?"}
+{"prompt": "Solve: 5x - 3 = 22."}
+{"prompt": "What is the volume of a cube with side length 4?"}
+{"prompt": "Convert 3 kilometers to meters."}
+{"prompt": "What is 13 squared?"}
+{"prompt": "If a car uses 8 liters per 100km, how much for 350km?"}
+{"prompt": "What is the median of 3, 7, 9, 12, 15?"}
+{"prompt": "Calculate 25 * 4 + 30 / 6."}
+{"prompt": "What is the factorial of 5?"}
+{"prompt": "If 3x = 27, what is x?"}
+{"prompt": "What is 10% of 0.5?"}
+{"prompt": "Simplify the fraction 18/24."}
+{"prompt": "What is the next prime number after 7?"}
+{"prompt": "How many degrees are in a right angle?"}
+{"prompt": "Compute 1/2 * 3/4."}
+{"prompt": "What is the surface area of a cube with side 3?"}
+{"prompt": "If a population grows by 10% per year from 1000, what is it after 2 years?"}
+{"prompt": "What is the absolute value of -15?"}
+{"prompt": "Solve: x^2 = 49."}
+{"prompt": "How many minutes are in 2.5 hours?"}
+{"prompt": "What is 0.1 + 0.02 + 0.003?"}
+{"prompt": "A bag has 3 red and 5 blue marbles. What is the probability of picking red?"}
+{"prompt": "What is the perimeter of a rectangle with sides 8 and 12?"}
+{"prompt": "Compute the cube root of 27."}
+{"prompt": "If y = 2x + 1 and x = 5, what is y?"}
+{"prompt": "What is the difference between 100 and 37?"}
+{"prompt": "How many edges does a rectangular prism have?"}
+{"prompt": "Simplify: (x + 2)(x - 2)."}
+{"prompt": "What is 4! divided by 2!?"}
+{"prompt": "Convert 5/8 to a decimal."}
+{"prompt": "What is the hypotenuse of a right triangle with legs 3 and 4?"}
+{"prompt": "What is 999 + 1?"}
+{"prompt": "If you save 5 per day, how much in 30 days?"}
+{"prompt": "What is the reciprocal of 7?"}
+{"prompt": "Compute log10(1000)."}
+{"prompt": "A pizza is cut into 8 equal slices. If you eat 3, what fraction remains?"}
+{"prompt": "What is the sum of angles in a triangle?"}
+{"prompt": "Round 3.14159 to 2 decimal places."}
+{"prompt": "What is 50% of 50% of 200?"}
+{"prompt": "If a = 3 and b = 4, what is a^2 + b^2?"}
+{"prompt": "How many factors does 12 have?"}
+{"prompt": "What is the negative of -7?"}
+{"prompt": "Express 0.125 as a fraction."}
+{"prompt": "What is the slope of the line y = 3x + 5?"}
+{"prompt": "A clock shows 3:15. What is the angle between the hour and minute hands?"}
+{"prompt": "What is 11 * 11?"}
+{"prompt": "If gas costs 3.50 per gallon and you buy 10 gallons, what is the total?"}
+{"prompt": "What are the first 3 multiples of 7?"}
+{"prompt": "How many zeros are in one million?"}
+{"prompt": "What is 2/3 + 2/3?"}
+{"prompt": "Compute the area of a circle with diameter 10."}
+{"prompt": "If a book has 300 pages and you read 45 per day, how many days to finish?"}
+{"prompt": "What is the value of 5^0?"}
+{"prompt": "Solve: 4(x - 1) = 20."}
+{"prompt": "What is the complement of a 35 degree angle?"}
+{"prompt": "How many distinct permutations of the word MATH?"}
+{"prompt": "What is 1/10 as a percentage?"}
+{"prompt": "If temperature drops from 15C to -3C, what is the change?"}
+{"prompt": "What is the greatest common factor of 18 and 30?"}
+{"prompt": "A train is 200m long traveling at 20m/s. How long to pass a pole?"}
+{"prompt": "What is the sum of the first 5 odd numbers?"}
+{"prompt": "Convert 45 degrees Celsius to Fahrenheit."}
+{"prompt": "What is 0.001 * 1000?"}
+{"prompt": "How many diagonals does a pentagon have?"}
+{"prompt": "Simplify: 6 + 3 * 2."}
+{"prompt": "What is 20% of 20% of 500?"}
+{"prompt": "If you flip a coin 3 times, how many possible outcomes?"}
+{"prompt": "What is the ratio of 15 to 25 in simplest form?"}
+{"prompt": "Find x if 3/5 = x/25."}
+{"prompt": "What is the mean of 2, 4, 6, 8, 10?"}
+{"prompt": "What is 7 * 8 + 6 / 2?"}
+{"prompt": "A cylinder has radius 3 and height 10. What is its volume?"}
+{"prompt": "What is the smallest prime number?"}
+{"prompt": "If f(x) = x^2 + 1, what is f(3)?"}
+{"prompt": "How many seconds in one hour?"}
+{"prompt": "What is the result of 100 mod 7?"}
+{"prompt": "Simplify: sqrt(50) / sqrt(2)."}
+{"prompt": "What is the distance between points (1,2) and (4,6)?"}
+{"prompt": "A recipe needs 2 cups flour for 12 cookies. How many cups for 30 cookies?"}
+{"prompt": "What is 1.5 * 2.5?"}
+{"prompt": "What is A intersection B if A = {1,2,3} and B = {2,3,4}?"}
+{"prompt": "What is the 10th term of the arithmetic sequence 3, 7, 11, 15?"}
+{"prompt": "How many cubic centimeters in a cubic meter?"}
+{"prompt": "What is the value of 2^10?"}
+{"prompt": "Solve the inequality: 2x > 10."}
+{"prompt": "What is 3/7 rounded to 2 decimal places?"}
+{"prompt": "What is the tangent of 45 degrees?"}
+{"prompt": "How many ways to choose 2 items from 5?"}
+{"prompt": "What is the product of all integers from 1 to 5?"}
+{"prompt": "If 8 workers finish a job in 6 days, how many days for 12 workers?"}
+{"prompt": "What is 1000 - 587?"}
+{"prompt": "Express 2500 in scientific notation."}
+{"prompt": "What is the sum of interior angles of a hexagon?"}
+{"prompt": "What is the decimal equivalent of the binary number 1010?"}
+{"prompt": "What is the area of a trapezoid with bases 6 and 10 and height 4?"}
+{"prompt": "Calculate 15 * 15."}
+{"prompt": "What is the supplementary angle of 110 degrees?"}
+{"prompt": "A store buys an item for 40 and sells for 60. What is the markup percentage?"}
+{"prompt": "Solve: |x - 3| = 5."}
+{"prompt": "How many days are in a leap year?"}
+{"prompt": "What is the compound interest on 1000 at 10% for 2 years?"}
+{"prompt": "If the base of a triangle is 8 and height is 5, what is the area?"}
+{"prompt": "What is 100 divided by 3 rounded to 2 decimal places?"}
+{"prompt": "What is the 7th Fibonacci number?"}
+{"prompt": "Convert 1 mile to feet."}
+{"prompt": "What is the LCM of 6, 8, and 12?"}
+{"prompt": "Simplify: 4(x + 3) - 2(x - 1)."}
+{"prompt": "If 3a + 2b = 16 and a = 4, what is b?"}
+{"prompt": "What is the sine of 30 degrees?"}
+{"prompt": "How many ways can 4 people sit in a row?"}
+{"prompt": "What is 0.5^3?"}
+{"prompt": "Find the 20th term of 5, 8, 11, 14"}
+{"prompt": "A triangle has sides 3, 4, 5. What type of triangle is it?"}
+{"prompt": "What is the absolute difference between -5 and 3?"}
+{"prompt": "How many grams in 2.5 kilograms?"}
+{"prompt": "What is the product of -3 and -7?"}
+{"prompt": "If a clock shows 9:00, what is the angle between the hands?"}
+{"prompt": "What is the square root of 81?"}
+{"prompt": "What is 1/3 + 1/6 + 1/12?"}
+{"prompt": "If x^2 - 4 = 0, what are the solutions?"}
+{"prompt": "What is the geometric mean of 4 and 16?"}
+{"prompt": "Convert 72 km/h to m/s."}
+{"prompt": "What is the value of cos(60 degrees)?"}
+{"prompt": "A box has 5 red, 3 green, 2 blue balls. What is P(not red)?"}
+{"prompt": "What is 2^0 + 2^1 + 2^2 + 2^3?"}
+{"prompt": "Find the slope between points (1, 3) and (4, 9)."}
+{"prompt": "What is the sum of the first 20 natural numbers?"}
+{"prompt": "What is the value of e rounded to 3 decimal places?"}
+{"prompt": "How many total degrees in a quadrilateral?"}
+{"prompt": "Simplify: (2^3 * 2^4) / 2^5."}
+{"prompt": "What is the probability of drawing a king from a standard deck?"}
+{"prompt": "A car travels 180 miles in 3 hours. What is its average speed?"}
+{"prompt": "What is the decimal 0.375 as a fraction?"}
+{"prompt": "Solve: 2(x + 5) = 3(x - 1)."}
+{"prompt": "How many milliliters in 3 liters?"}
+{"prompt": "What is the cube of 5?"}
+{"prompt": "What is 5/6 as a repeating decimal?"}
+{"prompt": "Find the circumference of a circle with radius 7."}
+{"prompt": "If 2 pipes fill a tank in 6 and 12 hours, how long together?"}
+{"prompt": "What is the coefficient of x in 3x^2 + 5x - 7?"}
+{"prompt": "What is the result of (10^3) / (10^-1)?"}
+{"prompt": "Find the GCD of 48 and 72."}
+{"prompt": "What is the domain of f(x) = sqrt(x)?"}
+{"prompt": "Simplify: 8/12 - 3/12."}
+{"prompt": "What is the arithmetic mean of the first 10 even numbers?"}
+{"prompt": "Convert -40 Celsius to Fahrenheit."}
+{"prompt": "What is the median of 1, 3, 5, 7, 9, 11?"}
+{"prompt": "What is the next number: 1, 1, 2, 3, 5, 8, 13?"}
+{"prompt": "If 4 workers can paint a fence in 8 hours, how long for 2 workers?"}
+{"prompt": "What is the cosine of 0 degrees?"}
+{"prompt": "A polygon has 9 sides. What is the sum of its interior angles?"}
+{"prompt": "What is 1.2 * 10^3 in standard form?"}
+{"prompt": "What is the range of the data set 5, 8, 3, 12, 7?"}
+{"prompt": "What is the LCM of 4, 5, and 6?"}
+{"prompt": "If y varies directly as x and y = 10 when x = 2, find y when x = 7."}
+{"prompt": "What is the degree of the polynomial 3x^4 + 2x^2 - x + 5?"}
+{"prompt": "How many diagonals does a hexagon have?"}
+{"prompt": "What is 75% expressed as a fraction in lowest terms?"}
+{"prompt": "How many ounces in 3 pounds?"}
+{"prompt": "What is the volume of a sphere with radius 3?"}
+{"prompt": "Solve the system: x + y = 8, x - y = 2."}
+{"prompt": "A triangle has two angles of 50 and 70 degrees. What is the third angle?"}
+{"prompt": "What is the remainder when 100 is divided by 7?"}
+{"prompt": "Express 0.04 as a percentage."}
+{"prompt": "What is the value of the expression 2 + 3 * 4 - 1?"}
+{"prompt": "How many prime numbers are between 10 and 30?"}
+{"prompt": "If a laptop costs 800 after 20% off, what was the original price?"}
+{"prompt": "What is 5 factorial minus 3 factorial?"}
+{"prompt": "Find the length of the diagonal of a rectangle 6 by 8."}
+{"prompt": "What is the sine of 90 degrees?"}
+{"prompt": "If the ratio of boys to girls is 3:2 and there are 30 students, how many girls?"}
+{"prompt": "What is the value of log2(32)?"}
+{"prompt": "What is the sum of 1 + 2 + 3 ... + 50?"}
+{"prompt": "Convert 40 inches to feet."}
+{"prompt": "What is the derivative of x^3?"}
+{"prompt": "What is 10^0 + 10^1 + 10^2?"}
+{"prompt": "A bag has 4 green, 6 red marbles. What is P(green or red)?"}
+{"prompt": "How many multiples of 3 are between 10 and 50?"}
+{"prompt": "If 2x + y = 7 and x = 3, what is y?"}
+{"prompt": "What is the midpoint of the segment from (2,3) to (8,7)?"}
+{"prompt": "Simplify: 2(3x - 1) + 4(x + 2)."}
+{"prompt": "How many triangles can be formed from 6 non-collinear points?"}
+{"prompt": "What is the 5th root of 32?"}
+{"prompt": "What is the mode of 3, 5, 3, 7, 5, 3, 8?"}
+{"prompt": "Find the slope of the line passing through (0,0) and (2,6)."}
+{"prompt": "What is the supplementary angle of 72 degrees?"}
+{"prompt": "How many positive divisors does 36 have?"}
+{"prompt": "Simplify: (a + b)^2 - (a - b)^2."}
+{"prompt": "How many seconds in 1.5 hours?"}
+{"prompt": "If a machine produces 120 items in 8 hours, how many per hour?"}
+{"prompt": "What is the inverse of f(x) = 2x + 3?"}
+{"prompt": "What is the greatest integer less than sqrt(50)?"}
+{"prompt": "What is 4^3 - 3^4?"}
+{"prompt": "What is the distance from (0,0) to (3,4)?"}
+{"prompt": "If sin(x) = 0.5, what is x in degrees?"}
+{"prompt": "A rectangle has area 48 and width 6. What is its length?"}
+{"prompt": "How many degrees does the minute hand move in 20 minutes?"}
+{"prompt": "What is the probability of rolling a sum of 7 with two dice?"}
+{"prompt": "Simplify: 3(x + 2) - 2(x - 4)."}
+{"prompt": "What is the value of floor(3.7)?"}
+{"prompt": "What is the weighted average of 80 (weight 3) and 90 (weight 7)?"}
+{"prompt": "Find the y-intercept of y = 3x - 6."}
+{"prompt": "How many sides does a decagon have?"}
+{"prompt": "What is the integral of 2x dx?"}
+{"prompt": "What is 2 + 2 * 2?"}
+{"prompt": "If a triangle has sides 5, 5, 5, what is it called?"}
+{"prompt": "What is the decimal for 7/8?"}
+{"prompt": "If f(x) = 1/x, what is f(5)?"}
+{"prompt": "What is the remainder when 2^10 is divided by 7?"}
diff --git a/training/opsd/main.py b/training/opsd/main.py
new file mode 100644
index 000000000..534c8ae0a
--- /dev/null
+++ b/training/opsd/main.py
@@ -0,0 +1,134 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""OPSD training entry point.
+
+Launch with the DeepSpeed launcher::
+
+    deepspeed --num_gpus 8 main.py --config configs/opsd_hybrid_engine.json
+
+The DeepSpeed launcher sets ``LOCAL_RANK``, ``RANK``, and ``WORLD_SIZE`` in
+the environment; we call :func:`deepspeed.init_distributed` to take that over.
+"""
+
+import argparse
+import json
+import os
+import random
+
+import deepspeed
+import numpy as np
+import torch
+from deepspeed.accelerator import get_accelerator
+from torch.utils.data import DataLoader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.runtime.rlhf.config import OPSDConfig
+from deepspeed.runtime.rlhf.data import LeftPaddedPromptCollator, PromptDataset
+from deepspeed.runtime.rollout import build_rollout
+from deepspeed.runtime.rlhf.teacher import TeacherWrapper
+from deepspeed.runtime.rlhf.trainer.opsd import OPSDTrainer
+
+
+def _seed_everything(seed: int) -> None:
+    # Seed the Python, NumPy and torch generators, then the accelerator generators if one is available.
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if get_accelerator().is_available():
+        get_accelerator().manual_seed_all(seed)
+
+
+def _resolve_dtype(name: str) -> torch.dtype:  # config dtype string -> torch dtype; KeyError for other names
+    return dict(float16=torch.float16, bfloat16=torch.bfloat16, float32=torch.float32)[name]
+
+
+def _load_ds_config(path: str) -> dict:
+    with open(path, "r") as handle:  # the parsed JSON document is returned as-is
+        return json.loads(handle.read())
+
+
+def main() -> None:  # entry point: load config, build engine / teacher / rollout / data, then train
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", required=True, help="Path to OPSDConfig JSON")
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()  # NOTE: args.local_rank is parsed but never read below
+
+    cfg = OPSDConfig.from_json(args.config)
+    cfg.validate()
+    _seed_everything(cfg.training.seed)  # same seed value in every process (no rank offset here)
+
+    deepspeed.init_distributed()
+
+    # --- tokenizer (shared between data + rollout) -------------------------
+    tokenizer = AutoTokenizer.from_pretrained(
+        cfg.student.model_name_or_path,
+        trust_remote_code=cfg.student.trust_remote_code,
+        padding_side="left",
+    )
+    if tokenizer.pad_token is None:  # tokenizer defines no pad token: fall back to EOS
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # --- student model + DeepSpeed engine ----------------------------------
+    student_dtype = _resolve_dtype(cfg.student.dtype)
+    student_model = AutoModelForCausalLM.from_pretrained(
+        cfg.student.model_name_or_path,
+        torch_dtype=student_dtype,
+        trust_remote_code=cfg.student.trust_remote_code,
+    )
+
+    ds_config = _load_ds_config(cfg.deepspeed_config)  # the three batch keys below override the JSON
+    ds_config["train_micro_batch_size_per_gpu"] = cfg.training.micro_batch_size_per_gpu
+    ds_config["train_batch_size"] = cfg.training.train_batch_size
+    ds_config["gradient_accumulation_steps"] = cfg.training.gradient_accumulation_steps
+
+    student_engine, *_ = deepspeed.initialize(  # only the first returned element is kept
+        model=student_model,
+        model_parameters=student_model.parameters(),
+        config=ds_config,
+    )
+
+    # --- frozen teacher ----------------------------------------------------
+    teacher = TeacherWrapper(cfg.teacher, world_size=dist_world_size())  # size read from WORLD_SIZE env
+
+    # --- rollout engine ----------------------------------------------------
+    rollout = build_rollout(
+        cfg.rollout,
+        student_engine=student_engine,
+        tokenizer=tokenizer,
+        student_model_path=cfg.student.model_name_or_path,
+    )
+
+    # --- dataloader --------------------------------------------------------
+    dataset = PromptDataset(
+        path=cfg.data.path,
+        tokenizer=tokenizer,
+        max_prompt_length=cfg.rollout.max_prompt_length,
+        prompt_field=cfg.data.prompt_field,
+        chat_template=cfg.data.chat_template,
+    )
+    collator = LeftPaddedPromptCollator(tokenizer=tokenizer, max_prompt_length=cfg.rollout.max_prompt_length)
+    loader = DataLoader(  # NOTE(review): no sampler passed; ranks may iterate identical prompts - confirm
+        dataset,
+        batch_size=cfg.training.micro_batch_size_per_gpu,
+        shuffle=cfg.data.shuffle,
+        collate_fn=collator,
+        drop_last=True,  # a trailing batch smaller than batch_size is discarded
+    )
+
+    OPSDTrainer(  # constructed and run immediately; the trainer object is not kept
+        cfg=cfg,
+        student_engine=student_engine,
+        teacher=teacher,
+        tokenizer=tokenizer,
+        rollout=rollout,
+        dataloader=loader,
+    ).train()
+
+
+def dist_world_size() -> int:  # world size as exported in the environment; 1 when WORLD_SIZE is unset
+    return int(os.getenv("WORLD_SIZE", "1"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/requirements.txt b/training/opsd/requirements.txt
new file mode 100644
index 000000000..fb5a09157
--- /dev/null
+++ b/training/opsd/requirements.txt
@@ -0,0 +1,5 @@
+datasets>=2.0.0
+numpy
+transformers>=4.40.0
+# Optional, only needed when rollout.engine == "vllm":
+# vllm>=0.6.4
diff --git a/training/opsd/scripts/train_opsd_hybrid.sh b/training/opsd/scripts/train_opsd_hybrid.sh
new file mode 100644
index 000000000..69e3bdc68
--- /dev/null
+++ b/training/opsd/scripts/train_opsd_hybrid.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+#
+# Launch OPSD training with the DeepSpeed hybrid-engine rollout (no vLLM).
+# Assumes you're cd'd into training/opsd/ (main.py and configs/ are relative).
+set -euo pipefail
+
+CONFIG="${1:-configs/opsd_hybrid_engine.json}"  # optional first argument overrides the config path
+NUM_GPUS="${NUM_GPUS:-8}"  # NUM_GPUS environment variable overrides the default of 8
+
+deepspeed --num_gpus "${NUM_GPUS}" main.py --config "${CONFIG}"
diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh
new file mode 100644
index 000000000..6ad847954
--- /dev/null
+++ b/training/opsd/scripts/train_opsd_vllm.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+#
+# Launch OPSD training with vLLM rollout.
+#
+# The vLLM server is started **lazily** as a subprocess by training rank 0
+# on first use, so no separate vLLM launch step is required.  The GPUs
+# listed in ``rollout.gpus`` in the config are assigned to the vLLM server
+# via ``CUDA_VISIBLE_DEVICES`` in the subprocess environment.
+#
+# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7
+# run vLLM with TP=2.  Adjust configs/opsd_vllm_disjoint.json::rollout.gpus
+# and NUM_TRAIN_GPUS to match your topology.
+set -euo pipefail
+
+CONFIG="${1:-configs/opsd_vllm_disjoint.json}"
+NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}"
+INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}"
+
+deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \
+    main.py --config "${CONFIG}"
diff --git a/training/opsd/tests/test_losses.py b/training/opsd/tests/test_losses.py
new file mode 100644
index 000000000..41ea92289
--- /dev/null
+++ b/training/opsd/tests/test_losses.py
@@ -0,0 +1,166 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only numerics tests for the distillation divergences.
+
+These exercise the loss math without needing GPUs, models, or a torchrun
+launcher. Run from the example root with::
+
+    cd training/opsd && python -m pytest tests/test_losses.py -v
+"""
+
+import pytest
+import torch
+
+from deepspeed.runtime.rlhf.losses import chunked_distillation_loss, per_token_logprobs
+from deepspeed.runtime.rlhf.utils import build_response_mask, shift_for_next_token_prediction
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_zero_when_identical(loss_type):  # a tensor compared with its own clone must give ~0 loss
+    torch.manual_seed(0)
+    logits = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)  # all ones: no position is masked out
+    loss = chunked_distillation_loss(logits, logits.clone(), mask, loss_type=loss_type)
+    assert loss.item() == pytest.approx(0.0, abs=1e-5)
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_positive_when_different(loss_type):  # independently drawn logits must give a strictly positive loss
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type=loss_type)
+    assert loss.item() > 0.0
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_chunking_equivalent_to_unchunked(loss_type):  # the result must not depend on chunk_size
+    torch.manual_seed(0)
+    s = torch.randn(2, 100, 32)
+    t = torch.randn(2, 100, 32)
+    mask = torch.ones(2, 100)  # chunk_size=10_000 below exceeds every dimension (presumably one chunk)
+    loss_chunked = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10)
+    loss_whole = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10_000)
+    assert torch.allclose(loss_chunked, loss_whole, atol=1e-5)
+
+
+def test_mask_excludes_tokens():  # positions with mask 0 must not influence the loss
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    half_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float32)
+    loss_direct = chunked_distillation_loss(s[:, :4], t[:, :4], torch.ones(2, 4), loss_type="reverse_kl")
+    loss_masked = chunked_distillation_loss(s, t, half_mask, loss_type="reverse_kl")
+    assert torch.allclose(loss_direct, loss_masked, atol=1e-5)  # truncating == masking the last four
+
+
+def test_gradient_flows_to_student():  # backward() must populate a non-zero grad on the first argument
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32, requires_grad=True)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
+    loss.backward()
+    assert s.grad is not None
+    assert s.grad.abs().sum().item() > 0  # the gradient is not identically zero
+
+
+def test_gradient_does_not_flow_to_teacher_when_detached():  # a detached teacher tensor gets no grad
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32, requires_grad=True)
+    t = torch.randn(2, 8, 32, requires_grad=True)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t.detach(), mask, loss_type="reverse_kl")  # the test detaches t
+    loss.backward()
+    assert t.grad is None  # NOTE: holds because of the detach() above, whatever the loss does internally
+
+
+def test_unknown_loss_type_raises():  # an unsupported loss_type string must raise ValueError
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 4, 8)
+    mask = torch.ones(2, 4)
+    with pytest.raises(ValueError, match="Unknown loss_type"):  # message text is part of the contract
+        chunked_distillation_loss(s, t, mask, loss_type="totally_made_up")
+
+
+def test_shape_mismatch_raises():  # the two logit tensors must have the same shape
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 5, 8)  # differs from s in the middle dimension (5 vs 4)
+    mask = torch.ones(2, 4)
+    with pytest.raises(ValueError, match="shape mismatch"):
+        chunked_distillation_loss(s, t, mask)
+
+
+def test_mask_shape_mismatch_raises():  # the mask must match the logits' leading two dimensions
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 4, 8)
+    mask = torch.ones(2, 5)  # 5 columns against logits with 4 positions
+    with pytest.raises(ValueError, match="does not match"):
+        chunked_distillation_loss(s, t, mask)
+
+
+@pytest.mark.parametrize("temperature", [0.5, 1.0, 2.0])
+def test_temperature_changes_loss_but_stays_finite(temperature):  # NOTE: only finiteness is asserted
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", temperature=temperature)
+    assert torch.isfinite(loss).item()  # does not compare losses across temperatures
+
+
+def test_jsd_is_symmetric():  # swapping the two logit arguments must not change the "jsd" loss
+    torch.manual_seed(0)
+    a = torch.randn(2, 8, 32)
+    b = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    jsd_ab = chunked_distillation_loss(a, b, mask, loss_type="jsd")
+    jsd_ba = chunked_distillation_loss(b, a, mask, loss_type="jsd")  # same call with arguments swapped
+    assert torch.allclose(jsd_ab, jsd_ba, atol=1e-5)
+
+
+def test_all_zero_mask_returns_zero():  # with every position masked out the loss must be 0, not NaN
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.zeros(2, 8)  # nothing contributes
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
+    assert loss.item() == pytest.approx(0.0, abs=1e-6)
+
+
+def test_per_token_logprobs_matches_manual():  # compare against a hand-written log_softmax + gather
+    torch.manual_seed(0)
+    logits = torch.randn(2, 4, 16)
+    labels = torch.randint(0, 16, (2, 4))
+    got = per_token_logprobs(logits, labels)
+    expected = torch.log_softmax(logits.float(), dim=-1)  # reference computed in float32
+    expected = expected.gather(-1, labels.unsqueeze(-1)).squeeze(-1)  # pick each label's log-probability
+    assert torch.allclose(got, expected, atol=1e-6)
+
+
+def test_build_response_mask_basic():  # worked example with one padded row and one full row
+    attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
+    response_start_idx = torch.tensor([2, 3])
+    resp = build_response_mask(response_start_idx, attention_mask)
+    expected = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])  # 1 from start idx on, 0 where attention is 0
+    assert torch.equal(resp, expected)
+
+
+def test_build_response_mask_validates_shapes():  # three malformed argument combinations must raise
+    with pytest.raises(ValueError, match="response_start_idx must be 1-D"):
+        build_response_mask(torch.zeros(2, 2), torch.ones(2, 4))  # 2-D start index
+    with pytest.raises(ValueError, match="attention_mask must be 2-D"):
+        build_response_mask(torch.zeros(2), torch.ones(4))  # 1-D attention mask
+    with pytest.raises(ValueError, match="batch"):
+        build_response_mask(torch.zeros(3), torch.ones(2, 4))  # 3 start indices against 2 rows
+
+
+def test_shift_for_next_token_prediction_shapes():  # shape-only check; values are not inspected
+    logits = torch.randn(2, 5, 8)
+    labels = torch.randint(0, 8, (2, 5))
+    sl, sla = shift_for_next_token_prediction(logits, labels)
+    assert sl.shape == (2, 4, 8)  # length 5 becomes 4 for the logits
+    assert sla.shape == (2, 4)  # and for the labels
diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py
new file mode 100644
index 000000000..36d2fcea8
--- /dev/null
+++ b/training/opsd/tests/test_teacher_caching.py
@@ -0,0 +1,101 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only tests for TeacherLogitCache.
+
+The ``TeacherWrapper`` itself (which wraps deepspeed+transformers) is not
+exercised here because it requires a real model and a DeepSpeed launcher; the
+caching/streaming pieces are isolated into ``TeacherLogitCache`` so they can
+be tested in isolation.
+"""
+
+import pytest
+import torch
+
+from deepspeed.runtime.rlhf.teacher import TeacherLogitCache
+
+
+def test_round_trip_preserves_values_within_dtype():
+    torch.manual_seed(0)
+    gpu_like = torch.randn(2, 16, 32, dtype=torch.float32)  # a CPU tensor standing in for device logits
+    cache = TeacherLogitCache.from_gpu_logits(gpu_like, store_dtype=torch.bfloat16)
+    assert cache.shape == (2, 16, 32)
+    assert cache.dtype == torch.bfloat16
+    chunk = cache.chunk_to_device(0, 16, torch.device("cpu"), dtype=torch.float32)
+    # bf16 round-trip loses precision, so compare with a tolerance instead of
+    # exact equality; atol/rtol of 1e-1 are far looser than bf16 rounding error.
+    assert torch.allclose(chunk, gpu_like, atol=1e-1, rtol=1e-1)
+
+
+def test_chunk_slicing_is_correct():  # chunk_to_device(start, end) must equal src[:, start:end]
+    torch.manual_seed(0)
+    src = torch.randn(3, 100, 8)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)  # float32: no rounding
+    for start, end in [(0, 10), (10, 50), (50, 100), (33, 77)]:
+        got = cache.chunk_to_device(start, end, torch.device("cpu"))
+        assert got.shape == (3, end - start, 8)
+        assert torch.allclose(got, src[:, start:end])
+
+
+def test_invalid_chunk_bounds_raise():  # out-of-range or reversed bounds must raise ValueError
+    cache = TeacherLogitCache.from_gpu_logits(torch.zeros(1, 8, 4), store_dtype=torch.float32)
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(0, 9, torch.device("cpu"))  # end past the length of 8
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(5, 3, torch.device("cpu"))  # start greater than end
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(-1, 4, torch.device("cpu"))  # negative start
+
+
+def test_rejects_non_3d_logits():  # the constructor must refuse a 2-D tensor
+    with pytest.raises(ValueError, match="must be 3-D"):
+        TeacherLogitCache(cpu_logits=torch.zeros(8, 32))
+
+
+def test_rejects_gpu_resident_logits():  # the constructor must refuse a tensor that is not on CPU
+    if not torch.cuda.is_available():  #ignore-cuda
+        pytest.skip("no CUDA available to construct GPU tensor")
+    with pytest.raises(ValueError, match="must live on CPU"):
+        TeacherLogitCache(cpu_logits=torch.zeros(1, 8, 4, device="cuda"))
+
+
+def test_dtype_override_in_chunk_to_device():  # the dtype argument converts the returned chunk
+    src = torch.randn(2, 8, 16, dtype=torch.float32)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
+    chunk = cache.chunk_to_device(0, 8, torch.device("cpu"), dtype=torch.bfloat16)  # stored fp32, asked bf16
+    assert chunk.dtype == torch.bfloat16
+
+
+def test_free_releases_buffer():  # after free() the cached tensor must hold zero elements
+    src = torch.randn(2, 32, 16)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
+    assert cache.cpu_logits.numel() == 2 * 32 * 16  # full buffer present before free()
+    cache.free()
+    assert cache.cpu_logits.numel() == 0
+
+
+def test_default_store_dtype_is_bf16():  # omitting store_dtype must store the logits as bfloat16
+    src = torch.randn(1, 4, 8)
+    cache = TeacherLogitCache.from_gpu_logits(src)
+    assert cache.dtype == torch.bfloat16
+
+
+def test_streamed_chunked_loss_matches_full_loss():
+    """End-to-end check: pulling teacher logits chunk-by-chunk through the
+    cache yields the same distillation loss as passing the full teacher tensor
+    to ``chunked_distillation_loss`` directly."""
+    from deepspeed.runtime.rlhf.losses import chunked_distillation_loss
+
+    torch.manual_seed(0)
+    s = torch.randn(2, 64, 32)
+    t = torch.randn(2, 64, 32)
+    mask = torch.ones(2, 64)
+
+    direct = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", chunk_size=8)  # reference value
+
+    cache = TeacherLogitCache.from_gpu_logits(t, store_dtype=torch.float32)  # float32: lossless storage
+    staged_full = cache.chunk_to_device(0, 64, torch.device("cpu"), dtype=torch.float32)  # NOTE(review): one full-range read
+    via_cache = chunked_distillation_loss(s, staged_full, mask, loss_type="reverse_kl", chunk_size=8)
+
+    assert torch.allclose(direct, via_cache, atol=1e-6)  # per-chunk reads from the cache are not exercised here

From 6b8f5843cc6220d2f41c11c206ae7d85c33fdb79 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Wed, 1 Jul 2026 18:47:14 +0800
Subject: [PATCH 2/8] Use ROLLOUT_VISIBLE_DEVICE env var for vLLM GPU
 placement; rename vllm_dtype to engine_dtype

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/README.md                       | 18 ++++++++++++++----
 training/opsd/configs/opsd_vllm_disjoint.json |  3 +--
 training/opsd/configs/smoke_vllm.json         |  3 +--
 training/opsd/scripts/train_opsd_vllm.sh      | 13 ++++++++-----
 4 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/training/opsd/README.md b/training/opsd/README.md
index 3fce93c36..f0fcc6d99 100644
--- a/training/opsd/README.md
+++ b/training/opsd/README.md
@@ -86,9 +86,11 @@ NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \
     bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json
 ```
 
-vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0
-constructs the `LLM` handle; other training ranks receive generated token
-ids via NCCL broadcast.
+vLLM gets dedicated GPUs via the `ROLLOUT_VISIBLE_DEVICE` environment
+variable (comma-separated CUDA device indices, e.g.
+`ROLLOUT_VISIBLE_DEVICE=6,7`). Training rank 0 spawns the vLLM server as
+a subprocess with `CUDA_VISIBLE_DEVICES` set to those devices; other
+training ranks receive generated token ids via NCCL broadcast.
 
 ### Smoke tests (5 steps, small models)
 
@@ -100,7 +102,8 @@ end-to-end before scaling up.
 cd examples/opsd
 deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json
 # For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM):
-NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --num_gpus 2 --include localhost:0,1 \
+NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 ROLLOUT_VISIBLE_DEVICE=2,3 \
+    deepspeed --num_gpus 2 --include localhost:0,1 \
     main.py --config configs/smoke_vllm.json
 ```
 
@@ -133,6 +136,13 @@ python -m pytest tests/ -v
 See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json`
 for fully-populated examples.
 
+**GPU placement for vLLM rollout:** The GPUs available to the vLLM server
+are controlled by the `ROLLOUT_VISIBLE_DEVICE` environment variable
+(comma-separated CUDA device indices, e.g. `ROLLOUT_VISIBLE_DEVICE=6,7`),
+not by a field in the JSON config. This keeps the vLLM device assignment
+decoupled from the DeepSpeed launcher's own `CUDA_VISIBLE_DEVICES` /
+`--include` flags, which control only the training ranks.
+
 ## Adding a new model architecture
 
 No special steps are needed for new model architectures. vLLM's RLHF weight
diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json
index c98489df6..63c1ecc03 100644
--- a/training/opsd/configs/opsd_vllm_disjoint.json
+++ b/training/opsd/configs/opsd_vllm_disjoint.json
@@ -18,10 +18,9 @@
         "top_p": 1.0,
         "top_k": -1,
         "n_samples_per_prompt": 1,
-        "gpus": [6, 7],
         "tensor_parallel_size": 2,
         "gpu_memory_utilization": 0.85,
-        "vllm_dtype": "bfloat16",
+        "engine_dtype": "bfloat16",
         "weight_sync_interval": 4,
         "vllm_min_version": "0.6.4",
         "vllm_port": 8000
diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json
index fe375e602..bda234ca0 100644
--- a/training/opsd/configs/smoke_vllm.json
+++ b/training/opsd/configs/smoke_vllm.json
@@ -18,10 +18,9 @@
         "top_p": 1.0,
         "top_k": -1,
         "n_samples_per_prompt": 1,
-        "gpus": [],
         "tensor_parallel_size": 1,
         "gpu_memory_utilization": 0.3,
-        "vllm_dtype": "bfloat16",
+        "engine_dtype": "bfloat16",
         "weight_sync_interval": 2,
         "vllm_min_version": "0.6.4",
         "vllm_enforce_eager": true,
diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh
index 6ad847954..f39d659ec 100644
--- a/training/opsd/scripts/train_opsd_vllm.sh
+++ b/training/opsd/scripts/train_opsd_vllm.sh
@@ -8,17 +8,20 @@
 #
 # The vLLM server is started **lazily** as a subprocess by training rank 0
 # on first use, so no separate vLLM launch step is required.  The GPUs
-# listed in ``rollout.gpus`` in the config are assigned to the vLLM server
-# via ``CUDA_VISIBLE_DEVICES`` in the subprocess environment.
+# assigned to the vLLM server are controlled by the ROLLOUT_VISIBLE_DEVICE
+# environment variable (comma-separated CUDA device indices).  The training
+# ranks must run on a *different* set of GPUs so the two don't contend for
+# memory.
 #
-# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7
-# run vLLM with TP=2.  Adjust configs/opsd_vllm_disjoint.json::rollout.gpus
-# and NUM_TRAIN_GPUS to match your topology.
+# Default topology: ranks 0..5 train on GPUs 0-5 (ZeRO-3), devices 6-7
+# run vLLM with TP=2.  Override via:
+#   ROLLOUT_VISIBLE_DEVICE=... NUM_TRAIN_GPUS=.. INCLUDE_GPUS=.. bash ...
 set -euo pipefail
 
 CONFIG="${1:-configs/opsd_vllm_disjoint.json}"
 NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}"
 INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}"
+export ROLLOUT_VISIBLE_DEVICE="${ROLLOUT_VISIBLE_DEVICE:-6,7}"
 
 deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \
     main.py --config "${CONFIG}"

From 7580c2805bc62a7f201bc8e9cb7c5ca820703787 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 17:51:45 +0800
Subject: [PATCH 3/8] Remove vLLM path, absorb
 trainer/config/losses/utils/benchmarks from DeepSpeed

- Delete vLLM configs, scripts (opsd_vllm_disjoint.json, smoke_vllm.json, train_opsd_vllm.sh)
- Add trainer.py, config.py, losses.py, utils.py (moved from DeepSpeed)
- Add benchmarks/ (5 hybrid engine benchmarks moved from DeepSpeed)
- Update main.py imports (trainer, config now local)
- Update test imports (losses, utils now local)
- Rewrite README (remove all vLLM sections)

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/README.md                       | 153 +++----------
 training/opsd/benchmarks/bench_14b_rollout.py | 134 +++++++++++
 training/opsd/benchmarks/bench_autotp_gc.py   |  96 ++++++++
 training/opsd/benchmarks/bench_decode_1p1r.py | 180 +++++++++++++++
 training/opsd/benchmarks/bench_hybrid_tp.py   | 145 ++++++++++++
 .../opsd/benchmarks/bench_hybrid_tp_opt.py    | 149 +++++++++++++
 training/opsd/config.py                       | 104 +++++++++
 training/opsd/configs/opsd_vllm_disjoint.json |  53 -----
 training/opsd/configs/smoke_vllm.json         |  56 -----
 training/opsd/losses.py                       | 192 ++++++++++++++++
 training/opsd/main.py                         |   4 +-
 training/opsd/scripts/train_opsd_vllm.sh      |  27 ---
 training/opsd/tests/test_losses.py            |   4 +-
 training/opsd/tests/test_teacher_caching.py   |   2 +-
 training/opsd/trainer.py                      | 210 ++++++++++++++++++
 training/opsd/utils.py                        |  52 +++++
 16 files changed, 1295 insertions(+), 266 deletions(-)
 create mode 100644 training/opsd/benchmarks/bench_14b_rollout.py
 create mode 100644 training/opsd/benchmarks/bench_autotp_gc.py
 create mode 100644 training/opsd/benchmarks/bench_decode_1p1r.py
 create mode 100644 training/opsd/benchmarks/bench_hybrid_tp.py
 create mode 100644 training/opsd/benchmarks/bench_hybrid_tp_opt.py
 create mode 100644 training/opsd/config.py
 delete mode 100644 training/opsd/configs/opsd_vllm_disjoint.json
 delete mode 100644 training/opsd/configs/smoke_vllm.json
 create mode 100644 training/opsd/losses.py
 delete mode 100644 training/opsd/scripts/train_opsd_vllm.sh
 create mode 100644 training/opsd/trainer.py
 create mode 100644 training/opsd/utils.py

diff --git a/training/opsd/README.md b/training/opsd/README.md
index f0fcc6d99..392e2c3ce 100644
--- a/training/opsd/README.md
+++ b/training/opsd/README.md
@@ -11,7 +11,7 @@ step has three phases:
 ```
 ┌────────────┐   prompts   ┌──────────────────┐   prompt+response   ┌────────────┐
 │ Dataloader │ ──────────▶ │ Student rollout  │ ──────────────────▶ │  Teacher   │
-└────────────┘             │ (hybrid / vLLM)  │                     │  forward   │
+└────────────┘             │ (hybrid engine)  │                     │  forward   │
                            └──────────────────┘                     └─────┬──────┘
                                                                           │ logits → CPU cache
                                                                           ▼
@@ -30,29 +30,17 @@ co-resides with the student logits on the training device.
 ## Layout
 
 ```
-examples/opsd/
+training/opsd/
 ├── main.py                            # entry point (deepspeed launcher)
-├── opsd/
-│   ├── config.py                      # OPSDConfig dataclass + JSON loader
-│   ├── losses.py                      # chunked / streamed KL & JSD
-│   ├── teacher.py                     # frozen teacher + CPU logit cache
-│   ├── trainer.py                     # three-phase training loop
-│   ├── data.py                        # JSONL prompt dataset + left-pad collator
-│   ├── utils.py                       # response-mask + shift helpers
-│   └── rollout/
-│       ├── base.py                    # RolloutEngine ABC, request/batch dataclasses
-│       ├── hybrid_engine.py           # DeepSpeed hybrid-engine rollout
-│       └── vllm.py                    # vLLM rollout on disjoint GPUs
+├── trainer.py                         # three-phase OPSD training loop
 ├── configs/
 │   ├── ds_zero3.json                  # base DeepSpeed ZeRO-3 + hybrid engine
 │   ├── opsd_hybrid_engine.json        # production-ish hybrid-engine OPSD config
-│   ├── opsd_vllm_disjoint.json        # vLLM rollout on a disjoint GPU group
 │   ├── smoke_hybrid.json              # 5-step smoke test with Qwen2.5-0.5B / 1.5B
-│   ├── smoke_vllm.json                # same but with vLLM rollout
 │   └── smoke_ds_zero3.json            # ZeRO-3 config tuned for smoke runs
+├── benchmarks/                        # rollout / decode micro-benchmarks
 ├── scripts/
-│   ├── train_opsd_hybrid.sh           # launch hybrid-engine training
-│   └── train_opsd_vllm.sh             # launch vLLM training
+│   └── train_opsd_hybrid.sh           # launch hybrid-engine training
 └── tests/                             # CPU-only unit tests (run with pytest)
 ```
 
@@ -62,58 +50,35 @@ examples/opsd/
 
 ```
 pip install deepspeed transformers datasets accelerate
-# Optional, only for the vLLM rollout backend:
-pip install 'vllm>=0.6.4'
 ```
 
-### Hybrid-engine training (single-node, no vLLM)
+### Hybrid-engine training
 
 ```
-cd examples/opsd
+cd training/opsd
 NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json
 ```
 
 The hybrid engine path lives entirely within DeepSpeed: the student engine
-both trains and generates, sharing weights without a copy step. Easiest to
-get running; slower generation than vLLM.
-
-### vLLM training (disjoint GPU group)
-
-```
-cd examples/opsd
-# Train on GPUs 0..5, run vLLM on 6,7 (matches default config)
-NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \
-    bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json
-```
-
-vLLM gets dedicated GPUs via the `ROLLOUT_VISIBLE_DEVICE` environment
-variable (comma-separated CUDA device indices, e.g.
-`ROLLOUT_VISIBLE_DEVICE=6,7`). Training rank 0 spawns the vLLM server as
-a subprocess with `CUDA_VISIBLE_DEVICES` set to those devices; other
-training ranks receive generated token ids via NCCL broadcast.
+both trains and generates, sharing weights without a copy step.
 
 ### Smoke tests (5 steps, small models)
 
-The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B
+The `smoke_hybrid.json` config runs on 2 GPUs in a few minutes with Qwen2.5-0.5B
 (student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated
 end-to-end before scaling up.
 
 ```
-cd examples/opsd
+cd training/opsd
 deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json
-# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM):
-NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 ROLLOUT_VISIBLE_DEVICE=2,3 \
-    deepspeed --num_gpus 2 --include localhost:0,1 \
-    main.py --config configs/smoke_vllm.json
 ```
 
 ## Unit tests
 
-The CPU-runnable test suite exercises the loss math, teacher caching, rollout
-contract, and vLLM stitch logic. Run with:
+The CPU-runnable test suite exercises the loss math and teacher caching. Run with:
 
 ```
-cd examples/opsd
+cd training/opsd
 python -m pytest tests/ -v
 ```
 
@@ -123,9 +88,9 @@ python -m pytest tests/ -v
 
 ```json
 {
-  "student":    { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" },
+  "student":    { "model_name_or_path": "...", "dtype": "bfloat16" },
   "teacher":    { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true },
-  "rollout":    { "engine": "hybrid_engine | vllm", ... },
+  "rollout":    { "engine": "hybrid_engine", ... },
   "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 },
   "training":   { "train_batch_size": 8, "learning_rate": 1e-6, ... },
   "data":       { "path": "data/prompts.jsonl", "prompt_field": "prompt" },
@@ -133,21 +98,7 @@ python -m pytest tests/ -v
 }
 ```
 
-See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json`
-for fully-populated examples.
-
-**GPU placement for vLLM rollout:** The GPUs available to the vLLM server
-are controlled by the `ROLLOUT_VISIBLE_DEVICE` environment variable
-(comma-separated CUDA device indices, e.g. `ROLLOUT_VISIBLE_DEVICE=6,7`),
-not by a field in the JSON config. This keeps the vLLM device assignment
-decoupled from the DeepSpeed launcher's own `CUDA_VISIBLE_DEVICES` /
-`--include` flags, which control only the training ranks.
-
-## Adding a new model architecture
-
-No special steps are needed for new model architectures. vLLM's RLHF weight
-transfer API handles TP slicing internally; the caller only needs to send full
-tensors.
+See `configs/opsd_hybrid_engine.json` for a fully-populated example.
 
 ## Design notes
 
@@ -158,65 +109,11 @@ tensors.
   (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU
   one sequence slice at a time so the full tensor never re-materialises.
 
-* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends
-  have very different lifecycles (hybrid engine reads student weights live;
-  vLLM holds its own copy and must be synced) but the trainer should not
-  care. The ABC keeps the trainer engine-agnostic so additional backends
-  (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching
-  the loop.
-
-* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is
-  simpler to debug — failures in vLLM don't take down training and vice
-  versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as
-  a follow-up.
-
-* **Weight sync uses vLLM's RLHF API.** vLLM 0.22.0+ exposes
-  ``/update_weights`` which handles TP slicing internally. The trainer
-  sends full tensors and vLLM distributes them.
-
-## vLLM status
-
-The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but
-not yet usable under the DeepSpeed launcher**. During live validation on
-4× H200 we hit a blocking issue:
-
-> vLLM's worker init calls `new_group(...)` on the global process group as
-> a collective. Under `deepspeed --num_gpus N`, the world is all `N`
-> training ranks but only rank 0 calls into vLLM, so the constructor hangs
-> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 +
-> torch 2.5.1. Standalone vLLM (world size 1) works in seconds.
-
-The fix requires running vLLM in a **separate top-level Python process**
-with its own world, accessed over HTTP/RPC from the trainer — the pattern
-used by TRL and OpenRLHF. That's a larger refactor than fits in this PR;
-the current `VLLMRollout` will be the basis for it once landed.
-
-What's verified for the vLLM path today:
-* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test)
-* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated)
-
-What's **not** verified:
-* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig`
-* `LLM.collective_rpc("load_weights", ...)` weight sync at training time
-
-The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated
-end-to-end on the same hardware.
-
-## Other known limitations (v1)
-
-* **vLLM weight sync (when it works) goes through pickle** —
-  `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`.
-  Expect several seconds per sync on a 7B model. A faster v2 would broadcast
-  tensors via NCCL on a shared trainer↔vLLM process group — see verl's
-  `bucketed_weight_transfer.py` for a reference design.
-* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's
-  slicing math is unit-tested but no live run exists.
-* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not
-  ported. Easy to add: scale `per_tok` by a reward weight in the loss path.
-* **GRPO and other on-policy RL recipes** are out of scope. The
-  `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO
-  trainer would add its own advantage / KL-to-reference logic on top.
-* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed.
+* **Why an abstract `RolloutEngine`?** The ABC keeps the trainer
+  engine-agnostic so additional backends can be added without touching the
+  training loop. DeepSpeed provides the `HybridEngineRollout` implementation;
+  external frameworks may plug in their own.
+
 * **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no
   hybrid-engine inference acceleration), since DeepSpeed's inference policy
   list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15.
@@ -224,9 +121,15 @@ end-to-end on the same hardware.
   model's `generate` directly — correct, just ~3-5x slower than the
   accelerated path.
 
+## Other known limitations
+
+* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not
+  ported. Easy to add: scale `per_tok` by a reward weight in the loss path.
+* **GRPO and other on-policy RL recipes** are out of scope. The
+  `RolloutEngine` abstraction is reusable, but a GRPO trainer would add its
+  own advantage / KL-to-reference logic on top.
+
 ## References
 
 * OPSD reference repo: <https://github.com/HJSang/OPSD_OnPolicyDistillation>
 * DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py`
-* verl rollout / weight-sync design (used as a cross-check):
-  <https://github.com/volcengine/verl/tree/main/verl/workers/rollout/vllm_rollout>
diff --git a/training/opsd/benchmarks/bench_14b_rollout.py b/training/opsd/benchmarks/bench_14b_rollout.py
new file mode 100644
index 000000000..d66c7615d
--- /dev/null
+++ b/training/opsd/benchmarks/bench_14b_rollout.py
@@ -0,0 +1,134 @@
+"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
+import time
+import os
+import sys
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL = "Qwen/Qwen2.5-14B-Instruct"
+MAX_NEW_TOKENS = 256
+N_SAMPLES = 1
+CB_SIZE = 1
+N_RUNS = 5
+PROMPT = "def fibonacci(n):"
+
+
+def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=cb_size,
+        use_graph_capture=use_graph_capture,
+    )
+
+    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(
+        max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95,
+        n_samples_per_prompt=N_SAMPLES
+    )
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    total_toks = 0
+    for i in range(N_RUNS):
+        torch.manual_seed(42 + i)
+        engine.eval()
+        torch.cuda.synchronize()
+        t0 = time.time()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()
+        times.append(time.time() - t0)
+        engine.train()
+
+    # Count tokens from last run
+    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    t_avg = sum(times[1:]) / len(times[1:])
+
+    if rank == 0:
+        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s  "
+              f"runs={[f'{t*1000:.0f}' for t in times]}")
+
+    return total_toks, t_avg
+
+
+def main():
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    torch.cuda.set_device(local_rank)
+
+    world_size = torch.distributed.get_world_size()
+    tp_size = world_size  # all GPUs used for TP
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "zero_optimization": {"stage": 0},
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    if tp_size > 1:
+        ds_config["tensor_parallel"] = {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {"tp_size": tp_size},
+        }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
+        print(f"{'='*60}")
+
+    # 1P1R without graph capture (CB=1, no GC)
+    try:
+        bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}")
+    except Exception as e:
+        if rank == 0:
+            print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}")
+            import traceback; traceback.print_exc()
+
+    # 1P1R with CUDA graph capture
+    try:
+        bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}+GC")
+    except Exception as e:
+        if rank == 0:
+            print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}")
+            import traceback; traceback.print_exc()
+
+    if rank == 0:
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/benchmarks/bench_autotp_gc.py b/training/opsd/benchmarks/bench_autotp_gc.py
new file mode 100644
index 000000000..c9a245b24
--- /dev/null
+++ b/training/opsd/benchmarks/bench_autotp_gc.py
@@ -0,0 +1,96 @@
+"""Benchmark rollout with AutoTP + graph capture on 14B model."""
+import time
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def main():
+    """Time HybridEngineRollout.generate() on Qwen2.5-14B-Instruct with AutoTP=2 and graph capture enabled.
+    Rank 0 prints latency/throughput. Presumably launched on 2 ranks to match autotp_size=2 — confirm."""
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+
+    model_name = "Qwen/Qwen2.5-14B-Instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "zero_optimization": {"stage": 0},
+        "tensor_parallel": {
+            "autotp_size": 2,
+            "preset_model": "qwen2",
+            "tp": {"tp_size": 2},
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": 2,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=2,
+        use_graph_capture=True,
+    )
+
+    # Single fixed prompt, sampled n_samples_per_prompt=4 times per generate() call
+    prompt = "def fibonacci(n):"
+    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4)
+
+    # Warmup: one untimed generate() call before measurement starts
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark: 5 timed calls, each reseeded identically and bracketed by device syncs
+    times = []
+    for i in range(5):
+        torch.manual_seed(42)
+        engine.eval()
+        torch.cuda.synchronize()
+        t0 = time.time()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()
+        times.append(time.time() - t0)
+        engine.train()
+
+    t_avg = sum(times[1:]) / len(times[1:])  # first timed run is also dropped from the average
+    # Count non-pad response tokens, using the batch from the last timed run only
+    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+    total_toks = 0
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {model_name}")
+        print("TP=2, n=4, cb=2, graph_capture=True, max_new_tokens=256")
+        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
+        print(f"Total response tokens: {total_toks}")
+        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
+        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/benchmarks/bench_decode_1p1r.py b/training/opsd/benchmarks/bench_decode_1p1r.py
new file mode 100644
index 000000000..58fb667d4
--- /dev/null
+++ b/training/opsd/benchmarks/bench_decode_1p1r.py
@@ -0,0 +1,180 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Micro-benchmark for 1p1r HybridEngineRollout decode.
+
+Measures time breakdown of each decode step:
+  - model forward (attention + FFN)
+  - sampling (softmax + multinomial)
+  - Python overhead (mask concat, state update, etc.)
+
+Usage:
+  python training/opsd/benchmarks/bench_decode_1p1r.py --model Qwen/Qwen2.5-0.5B-Instruct
+"""
+
+import argparse
+import time
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_decode_raw(model, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
+    """Time a hand-written prefill + decode loop; return a dict of per-component averages (ms) over num_iters."""
+    model.eval()
+    model_dtype = next(model.parameters()).dtype
+
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+
+    results = {
+        "prompt_len": prompt_len,
+        "max_new_tokens": max_new_tokens,
+        "model_dtype": str(model_dtype),
+    }
+
+    timings = {"prefill": [], "decode_forward": [], "sampling": [], "overhead": [], "total": []}
+
+    for _ in range(num_warmup + num_iters):
+        with torch.no_grad():
+            t0 = time.perf_counter()  # NOTE(review): host timers, no device sync in this function — confirm intended
+            out = model(input_ids, attention_mask=attn_mask, use_cache=True)
+            past = out.past_key_values
+            logits = out.logits[:, -1:, :]
+            t_prefill = time.perf_counter()
+
+            generated = []
+            cur_token = logits.argmax(dim=-1)
+            generated.append(cur_token)
+            cur_mask = attn_mask
+
+            decode_times = []
+            sample_times = []
+            overhead_times = []
+
+            for step in range(max_new_tokens):
+                t_step = time.perf_counter()
+                cur_mask = torch.cat([cur_mask, torch.ones(1, 1, dtype=torch.long, device=device)], dim=1)
+                pos_ids = torch.tensor([[prompt_len + step]], device=device)
+
+                t_fwd = time.perf_counter()
+                out = model(cur_token,
+                            attention_mask=cur_mask,
+                            position_ids=pos_ids,
+                            past_key_values=past,
+                            use_cache=True)
+                past = out.past_key_values
+                t_fwd_end = time.perf_counter()
+
+                next_logits = out.logits[:, -1, :]
+                probs = torch.softmax(next_logits / 1.0, dim=-1)
+                cur_token = torch.multinomial(probs, 1)
+                t_sample = time.perf_counter()
+
+                generated.append(cur_token)
+                t_overhead = time.perf_counter()
+
+                decode_times.append(t_fwd_end - t_fwd)
+                sample_times.append(t_sample - t_fwd_end)
+                overhead_times.append((t_fwd - t_step) + (t_overhead - t_sample))  # pre-forward setup + bookkeeping
+
+            t_total = time.perf_counter()
+
+        timings["prefill"].append(t_prefill - t0)
+        timings["decode_forward"].append(decode_times)
+        timings["sampling"].append(sample_times)
+        timings["overhead"].append(overhead_times)
+        timings["total"].append(t_total - t0)
+
+    import numpy as np
+
+    def avg_last_n(lst, n):
+        return np.mean(lst[-n:])
+
+    def avg_of_avg(list_of_lists, n):
+        arrs = [np.array(ls) for ls in list_of_lists[-n:]]  # last n iterations (drops warmup), all steps of each
+        return np.mean([a.mean() for a in arrs])
+
+    results["prefill_ms"] = avg_last_n(timings["prefill"], num_iters) * 1000
+    results["decode_forward_ms_per_step"] = avg_of_avg(timings["decode_forward"], num_iters) * 1000
+    results["sampling_ms_per_step"] = avg_of_avg(timings["sampling"], num_iters) * 1000
+    results["overhead_ms_per_step"] = avg_of_avg(timings["overhead"], num_iters) * 1000
+    results["total_ms"] = avg_last_n(timings["total"], num_iters) * 1000
+    results["decode_steps_total_ms"] = results["decode_forward_ms_per_step"] * max_new_tokens
+    results["sampling_total_ms"] = results["sampling_ms_per_step"] * max_new_tokens
+    results["overhead_total_ms"] = results["overhead_ms_per_step"] * max_new_tokens
+
+    return results
+
+
+def bench_hybrid_rollout(rollout, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
+    """Benchmark the full HybridEngineRollout.generate() path."""
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for _ in range(num_warmup + num_iters):
+        get_accelerator().synchronize()  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            rollout.generate(request, sampling)
+        get_accelerator().synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+
+    import numpy as np
+    avg = np.mean(times[-num_iters:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    args = parser.parse_args()
+
+    device = get_accelerator().current_device()  #ignore-cuda
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16).to(device)
+
+    print(f"=== Raw decode loop benchmark (model={args.model}) ===")
+    raw = bench_decode_raw(model, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
+                           args.num_iters)
+    print(f"  Prefill:              {raw['prefill_ms']:.2f} ms")
+    print(
+        f"  Decode forward/step:  {raw['decode_forward_ms_per_step']:.3f} ms  (total: {raw['decode_steps_total_ms']:.1f} ms)"
+    )
+    print(f"  Sampling/step:        {raw['sampling_ms_per_step']:.3f} ms  (total: {raw['sampling_total_ms']:.1f} ms)")
+    print(f"  Overhead/step:        {raw['overhead_ms_per_step']:.3f} ms  (total: {raw['overhead_total_ms']:.1f} ms)")
+    print(f"  Total:                {raw['total_ms']:.1f} ms")
+
+    print(f"\n=== HybridEngineRollout benchmark ===")
+    rollout = HybridEngineRollout(model, tokenizer)
+    rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
+                              args.num_iters)
+    print(f"  Rollout generate:     {rr['rollout_total_ms']:.1f} ms")
+
+    print(f"\n=== Summary ===")
+    print(f"  Raw decode loop:      {raw['total_ms']:.1f} ms")
+    print(f"  HybridEngine rollout: {rr['rollout_total_ms']:.1f} ms")
+    print(f"  Overhead (rollout - raw): {rr['rollout_total_ms'] - raw['total_ms']:.1f} ms")
+    print(
+        f"  Bottleneck: decode forward = {raw['decode_forward_ms_per_step']:.3f} ms/step x {args.max_new_tokens} steps = {raw['decode_steps_total_ms']:.1f} ms"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/benchmarks/bench_hybrid_tp.py b/training/opsd/benchmarks/bench_hybrid_tp.py
new file mode 100644
index 000000000..3f41150c7
--- /dev/null
+++ b/training/opsd/benchmarks/bench_hybrid_tp.py
@@ -0,0 +1,145 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2).
+
+Usage:
+    deepspeed --num_gpus 2 bench_hybrid_tp.py \
+        --model Qwen/Qwen2.5-14B-Instruct \
+        --max-new-tokens 64
+"""
+
+import argparse
+import os
+import time
+
+import deepspeed
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    torch.manual_seed(42)
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for i in range(num_warmup + num_iters):
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            result = rollout.generate(request, sampling)
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        elapsed = time.perf_counter() - t0
+        times.append(elapsed)
+        if local_rank == 0:
+            label = "warmup" if i < num_warmup else "iter"
+            n_tokens = result.input_ids.shape[-1] - prompt_len
+            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
+
+    avg = np.mean(times[-num_iters:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    local_rank = args.local_rank
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    deepspeed.init_distributed()
+
+    if local_rank == 0:
+        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===")
+        print(f"  Model:       {args.model}")
+        print(f"  TP size:     {world_size}")
+        print(f"  Prompt len:  {args.prompt_len}")
+        print(f"  Decode len:  {args.max_new_tokens}")
+        print(f"  Warmup:      {args.num_warmup}")
+        print(f"  Iters:       {args.num_iters}")
+        print()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+    )
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "tensor_parallel": {
+            "autotp_size": world_size,
+            "preset_model": "qwen2",
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(
+        model=model,
+        optimizer=None,
+        model_parameters=model.parameters(),
+        config=ds_config,
+    )
+
+    if local_rank == 0:
+        print("  DeepSpeed engine initialized.")
+        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        print(f"  Parameters (local):  {param_count:.2f}B")
+        print(f"  GPU mem allocated:   {alloc:.1f} GB")
+        print()
+
+    rollout = HybridEngineRollout(engine, tokenizer)
+
+    if local_rank == 0:
+        print("  Running benchmark...")
+
+    result = bench_hybrid_rollout(
+        rollout,
+        tokenizer,
+        args.prompt_len,
+        args.max_new_tokens,
+        args.num_warmup,
+        args.num_iters,
+    )
+
+    if local_rank == 0:
+        total = result["rollout_total_ms"]
+        per_step = total / args.max_new_tokens
+        throughput = 1000.0 / per_step
+        print()
+        print(f"=== Results ===")
+        print(f"  Total generate:   {total:.1f} ms")
+        print(f"  Per decode step:  {per_step:.2f} ms")
+        print(f"  Throughput:       {throughput:.1f} tokens/s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/benchmarks/bench_hybrid_tp_opt.py b/training/opsd/benchmarks/bench_hybrid_tp_opt.py
new file mode 100644
index 000000000..d7fae2dde
--- /dev/null
+++ b/training/opsd/benchmarks/bench_hybrid_tp_opt.py
@@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer.
+
+Usage:
+    deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \
+        --model Qwen/Qwen2.5-14B-Instruct \
+        --max-new-tokens 64
+"""
+
+import argparse
+import os
+import time
+
+import deepspeed
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
+    """Time ``rollout.generate`` on one random prompt and return the mean latency of the timed runs (ms)."""
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(get_accelerator().device_name(local_rank))  # backend name comes from the accelerator
+    torch.manual_seed(42)
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for i in range(num_warmup + num_iters):
+        get_accelerator().synchronize(local_rank)  # index passed positionally; the parameter is not named `device`
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            result = rollout.generate(request, sampling)
+        get_accelerator().synchronize(local_rank)
+        elapsed = time.perf_counter() - t0
+        times.append(elapsed)
+        if local_rank == 0:
+            label = "warmup" if i < num_warmup else "iter"
+            n_tokens = result.input_ids.shape[-1] - prompt_len
+            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
+    # Drop the warmups from the front: ``times[-num_iters:]`` is the whole list when num_iters == 0.
+    avg = np.mean(times[num_warmup:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    """Parse CLI flags, build an AutoTP DeepSpeed engine for the model, and benchmark its hybrid rollout."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    local_rank = args.local_rank
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    deepspeed.init_distributed()
+
+    if local_rank == 0:
+        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===")
+        print(f"  Model:       {args.model}")
+        print(f"  TP size:     {world_size}")
+        print(f"  Prompt len:  {args.prompt_len}")
+        print(f"  Decode len:  {args.max_new_tokens}")
+        print()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+    )
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "tensor_parallel": {
+            "autotp_size": world_size,
+            "preset_model": "qwen2",
+        },
+    }
+
+    # NOTE(review): no optimizer is passed below or named in ds_config -- confirm one is really created here.
+    engine, _, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config,
+    )
+
+    if local_rank == 0:
+        print("  DeepSpeed engine initialized (with optimizer).")
+        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
+        print(f"  Parameters (local):  {param_count:.2f}B")
+        print(f"  GPU mem allocated:   {alloc:.1f} GB")
+        print(f"  GPU mem reserved:    {reserv:.1f} GB")
+        print()
+
+    rollout = HybridEngineRollout(engine, tokenizer)
+
+    if local_rank == 0:
+        print("  Running benchmark...")
+
+    result = bench_hybrid_rollout(
+        rollout,
+        tokenizer,
+        args.prompt_len,
+        args.max_new_tokens,
+        args.num_warmup,
+        args.num_iters,
+    )
+
+    if local_rank == 0:
+        total = result["rollout_total_ms"]
+        per_step = total / args.max_new_tokens
+        throughput = 1000.0 / per_step
+        print()
+        print("=== Results ===")
+        print(f"  Total generate:   {total:.1f} ms")
+        print(f"  Per decode step:  {per_step:.2f} ms")
+        print(f"  Throughput:       {throughput:.1f} tokens/s")
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
+        print(f"  GPU mem (final):   alloc={alloc:.1f} GB, reserved={reserv:.1f} GB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/opsd/config.py b/training/opsd/config.py
new file mode 100644
index 000000000..66ff7e21e
--- /dev/null
+++ b/training/opsd/config.py
@@ -0,0 +1,104 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""OPSD application configuration.
+
+``OPSDConfig`` is loaded from JSON and threads through the entire pipeline.
+The ``rollout`` sub-config is consumed by DeepSpeed's rollout engine; the
+rest is application-level (trainer, data, distillation).
+"""
+
+import json
+from dataclasses import dataclass, field
+from typing import Optional
+
+from deepspeed.runtime.rollout import RolloutConfig
+
+
+@dataclass
+class StudentConfig:  # "student" section of the OPSD config; OPSDConfig.from_dict requires it
+    model_name_or_path: str  # no default, so the section must always provide it
+    dtype: str = "bfloat16"
+    trust_remote_code: bool = False
+
+
+@dataclass
+class TeacherConfig:  # "teacher" section of the OPSD config; OPSDConfig.from_dict requires it
+    model_name_or_path: str  # no default, so the section must always provide it
+    dtype: str = "bfloat16"
+    trust_remote_code: bool = False
+    offload_to_cpu: bool = True  # NOTE(review): consumer is not in this module -- confirm what gets offloaded
+
+
+@dataclass
+class DistillationConfig:  # "distillation" section; OPSDTrainer forwards all three fields to the loss
+    # "forward_kl" | "reverse_kl" | "jsd"
+    loss_type: str = "reverse_kl"
+    temperature: float = 1.0  # the loss functions divide the logits by this value
+    chunk_size: int = 512  # sequence-axis chunk length; OPSDConfig.validate rejects values <= 0
+
+
+@dataclass
+class TrainingConfig:  # "training" section of the OPSD config; every field has a default
+    train_batch_size: int = 8
+    micro_batch_size_per_gpu: int = 1
+    gradient_accumulation_steps: int = 1
+    learning_rate: float = 1e-6
+    weight_decay: float = 0.0
+    num_train_epochs: int = 1  # number of passes OPSDTrainer.train makes over the dataloader
+    max_steps: int = -1  # OPSDTrainer.train stops at this many steps; values <= 0 mean no cap
+    warmup_steps: int = 0
+    save_steps: int = 500  # checkpoint interval, counted in trainer steps
+    logging_steps: int = 10  # log interval, counted in trainer steps
+    save_dir: str = "./opsd_ckpt"  # directory handed to the student engine's save_checkpoint
+    seed: int = 42
+
+
+@dataclass
+class DataConfig:  # "data" section of the OPSD config; optional in OPSDConfig.from_dict
+    path: str = ""
+    prompt_field: str = "prompt"  # presumably the record key holding the prompt text -- TODO confirm
+    chat_template: Optional[str] = None
+    shuffle: bool = True
+
+
+@dataclass
+class OPSDConfig:  # root config: "student" and "teacher" are required, every other section is optional
+    student: StudentConfig
+    teacher: TeacherConfig
+    rollout: RolloutConfig = field(default_factory=RolloutConfig)
+    distillation: DistillationConfig = field(default_factory=DistillationConfig)
+    training: TrainingConfig = field(default_factory=TrainingConfig)
+    data: DataConfig = field(default_factory=DataConfig)
+    deepspeed_config: str = ""
+
+    @classmethod
+    def from_json(cls, path: str) -> "OPSDConfig":  # parse the JSON file at `path`, then defer to from_dict
+        with open(path, "r", encoding="utf-8") as f:
+            return cls.from_dict(json.load(f))
+
+    @classmethod
+    def from_dict(cls, raw: dict) -> "OPSDConfig":  # KeyError if "student" or "teacher" is absent
+        return cls(
+            student=StudentConfig(**raw["student"]),
+            teacher=TeacherConfig(**raw["teacher"]),
+            rollout=RolloutConfig(**raw.get("rollout", {})),
+            distillation=DistillationConfig(**raw.get("distillation", {})),
+            training=TrainingConfig(**raw.get("training", {})),
+            data=DataConfig(**raw.get("data", {})),
+            deepspeed_config=raw.get("deepspeed_config", ""),
+        )
+
+    def to_dict(self) -> dict:  # recursive dataclasses.asdict view of this config
+        from dataclasses import asdict
+        return asdict(self)
+
+    def validate(self) -> None:  # raises ValueError on the first unsupported or non-positive setting
+        if self.distillation.loss_type not in ("forward_kl", "reverse_kl", "jsd"):
+            raise ValueError(f"Unknown loss_type {self.distillation.loss_type!r}")
+        if self.rollout.engine != "hybrid_engine":
+            raise ValueError(f"Unknown rollout engine {self.rollout.engine!r}; expected 'hybrid_engine'")
+        for name in ("chunk_size", "temperature"):  # the losses divide logits by temperature, so 0 is invalid too
+            if getattr(self.distillation, name) <= 0:
+                raise ValueError(f"distillation.{name} must be positive")
diff --git a/training/opsd/configs/opsd_vllm_disjoint.json b/training/opsd/configs/opsd_vllm_disjoint.json
deleted file mode 100644
index 63c1ecc03..000000000
--- a/training/opsd/configs/opsd_vllm_disjoint.json
+++ /dev/null
@@ -1,53 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": true
-    },
-    "rollout": {
-        "engine": "vllm",
-        "max_prompt_length": 1024,
-        "max_response_length": 1024,
-        "temperature": 0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "tensor_parallel_size": 2,
-        "gpu_memory_utilization": 0.85,
-        "engine_dtype": "bfloat16",
-        "weight_sync_interval": 4,
-        "vllm_min_version": "0.6.4",
-        "vllm_port": 8000
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 0,
-        "chunk_size": 512
-    },
-    "training": {
-        "train_batch_size": 1,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": -1,
-        "warmup_steps": 0,
-        "save_steps": 500,
-        "logging_steps": 10,
-        "save_dir": "./opsd_ckpt_vllm",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/ds_zero3.json"
-}
diff --git a/training/opsd/configs/smoke_vllm.json b/training/opsd/configs/smoke_vllm.json
deleted file mode 100644
index bda234ca0..000000000
--- a/training/opsd/configs/smoke_vllm.json
+++ /dev/null
@@ -1,56 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": false
-    },
-    "rollout": {
-        "engine": "vllm",
-        "max_prompt_length": 128,
-        "max_response_length": 64,
-        "temperature": 0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "tensor_parallel_size": 1,
-        "gpu_memory_utilization": 0.3,
-        "engine_dtype": "bfloat16",
-        "weight_sync_interval": 2,
-        "vllm_min_version": "0.6.4",
-        "vllm_enforce_eager": true,
-        "vllm_port": 8000,
-        "vllm_python": "/root/miniconda3/envs/vllm/bin/python",
-        "weight_transfer_backend": "gdr"
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 0,
-        "chunk_size": 128
-    },
-    "training": {
-        "train_batch_size": 1,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": 5,
-        "warmup_steps": 0,
-        "save_steps": 10000,
-        "logging_steps": 1,
-        "save_dir": "./opsd_smoke_vllm_ckpt",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/smoke_ds_zero0.json"
-}
diff --git a/training/opsd/losses.py b/training/opsd/losses.py
new file mode 100644
index 000000000..d9f4b9266
--- /dev/null
+++ b/training/opsd/losses.py
@@ -0,0 +1,192 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Per-token distillation divergences with sequence-axis chunking.
+
+The full ``[B, T, V]`` tensor produced by a forward pass on a modern LLM can
+easily exceed several GB in fp32 (e.g. 8 * 1024 * 150k * 4 B ~ 4.9 GB). Holding
+both student *and* teacher logits at once would double that. We chunk along the
+sequence axis so the per-chunk softmax + difference only ever needs
+``[B, chunk, V]`` of working memory, regardless of T.
+
+Math conventions:
+    * ``forward_kl``  = D_KL(teacher || student) — mode-covering for student
+    * ``reverse_kl``  = D_KL(student || teacher) — mode-seeking for student
+    * ``jsd``         = 0.5 * D_KL(P || M) + 0.5 * D_KL(Q || M), M = (P+Q)/2
+
+All three follow the standard knowledge-distillation temperature convention:
+divide logits by T before softmax, then multiply the result by T**2 so that
+gradient magnitudes are comparable across temperatures.
+"""
+
+from typing import Callable
+
+import torch
+import torch.nn.functional as F
+
+
+def _forward_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    """Per-position D_KL(teacher || student) of the temperature-scaled softmaxes, times temperature**2."""
+    log_q = F.log_softmax(student_logits / temperature, dim=-1)
+    log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
+    divergence = torch.sum(log_p.exp() * (log_p - log_q), dim=-1)
+    return divergence * (temperature**2)
+
+
+def _reverse_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    """Per-position D_KL(student || teacher) of the temperature-scaled softmaxes, times temperature**2."""
+    log_q = F.log_softmax(student_logits / temperature, dim=-1)
+    log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
+    divergence = torch.sum(log_q.exp() * (log_q - log_p), dim=-1)
+    return divergence * (temperature**2)
+
+
+def _jsd(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    """Per-position Jensen-Shannon divergence of the temperature-scaled softmaxes, times temperature**2."""
+    log_q = F.log_softmax(student_logits / temperature, dim=-1)
+    log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
+    q = log_q.exp()
+    p = log_p.exp()
+    # Floor the mixture before taking its log so that a vocab id with ~0 mass
+    # under both distributions cannot produce log(0).
+    log_m = (0.5 * (q + p)).clamp_min(1e-12).log()
+    kl_q_m = torch.sum(q * (log_q - log_m), dim=-1)
+    kl_p_m = torch.sum(p * (log_p - log_m), dim=-1)
+    return 0.5 * (kl_q_m + kl_p_m) * (temperature**2)
+
+
+_LOSS_FNS: "dict[str, Callable[..., torch.Tensor]]" = {  # loss_type name -> per-position divergence function
+    "forward_kl": _forward_kl,
+    "reverse_kl": _reverse_kl,
+    "jsd": _jsd,
+}
+
+
+def chunked_distillation_loss(
+    student_logits: torch.Tensor,
+    teacher_logits: torch.Tensor,
+    response_mask: torch.Tensor,
+    loss_type: str = "reverse_kl",
+    temperature: float = 1.0,
+    chunk_size: int = 512,
+) -> torch.Tensor:
+    """Mean per-token divergence over response positions, chunked over the
+    sequence axis to bound peak memory.
+
+    Args:
+        student_logits: ``[B, T, V]`` — gradient flows here.
+        teacher_logits: ``[B, T, V]`` — caller is responsible for ``detach()``
+            (we do not detach here so the function stays cheap).
+        response_mask:  ``[B, T]`` — 1 where the position should contribute to
+            the loss (i.e. response tokens, not prompt or padding), 0 elsewhere.
+        loss_type:      ``"forward_kl"`` | ``"reverse_kl"`` | ``"jsd"``.
+        temperature:    KD temperature, > 0 (logits are divided by it); >1 softens both distributions.
+        chunk_size:     Sequence-axis chunk size, > 0 (a negative value would skip every chunk).
+
+    Returns:
+        Scalar loss = sum-over-positions(per_tok * mask) / sum(mask), promoted
+        to fp32 internally for numerical stability; 0 when the mask is empty.
+
+    Raises:
+        ValueError: unknown ``loss_type``, non-positive ``temperature``/``chunk_size``, or mismatched shapes.
+    """
+    if loss_type not in _LOSS_FNS:
+        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
+    fn = _LOSS_FNS[loss_type]
+    if temperature <= 0 or chunk_size <= 0:
+        raise ValueError(f"temperature and chunk_size must be positive, got {temperature} and {chunk_size}")
+
+    if student_logits.shape != teacher_logits.shape:
+        raise ValueError(f"shape mismatch: student {tuple(student_logits.shape)} vs teacher "
+                         f"{tuple(teacher_logits.shape)}")
+    B, T, _ = student_logits.shape
+    if response_mask.shape != (B, T):
+        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits "
+                         f"prefix ({B}, {T})")
+
+    mask_f = response_mask.to(torch.float32)
+    total_tokens = mask_f.sum().clamp_min(1.0)
+    total_loss = student_logits.new_zeros((), dtype=torch.float32)
+
+    for start in range(0, T, chunk_size):
+        end = min(start + chunk_size, T)
+        chunk_mask = mask_f[:, start:end]
+        # Chunks without any response position add nothing to the sum, so skip their softmax work.
+        if chunk_mask.sum().item() == 0:
+            continue
+        per_tok = fn(student_logits[:, start:end].float(), teacher_logits[:, start:end].float(), temperature)
+        total_loss = total_loss + (per_tok * chunk_mask).sum()
+
+    return total_loss / total_tokens
+
+
+def streamed_distillation_loss(
+    student_logits: torch.Tensor,
+    teacher_chunk_fetcher: Callable[[int, int], torch.Tensor],
+    response_mask: torch.Tensor,
+    loss_type: str = "reverse_kl",
+    temperature: float = 1.0,
+    chunk_size: int = 512,
+) -> torch.Tensor:
+    """Same math as :func:`chunked_distillation_loss`, but teacher logits are
+    pulled chunk-by-chunk via a fetcher so the full ``[B, T, V]`` teacher
+    tensor never needs to live on the same device as the student.
+
+    Args:
+        student_logits: ``[B, T, V]`` on the training device.
+        teacher_chunk_fetcher: ``fn(start, end) -> [B, end - start, V]``, already
+            on the same device and broadcastable dtype as ``student_logits``.
+            Typically wraps ``TeacherLogitCache.chunk_to_device``.
+        response_mask:  ``[B, T]`` — 1 where the position should contribute.
+        loss_type:      one of ``"forward_kl" | "reverse_kl" | "jsd"``.
+        temperature:    KD temperature, > 0 (logits are divided by it).
+        chunk_size:     Sequence-axis chunk size, > 0 (a negative value would skip every chunk).
+
+    Raises:
+        ValueError: unknown ``loss_type``, non-positive ``temperature``/``chunk_size``, or a bad mask shape.
+        RuntimeError: the fetcher returned a chunk whose length is not ``end - start``.
+    """
+    if loss_type not in _LOSS_FNS:
+        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
+    fn = _LOSS_FNS[loss_type]
+    if temperature <= 0 or chunk_size <= 0:
+        raise ValueError(f"temperature and chunk_size must be positive, got {temperature} and {chunk_size}")
+
+    B, T, _ = student_logits.shape
+    if response_mask.shape != (B, T):
+        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits prefix ({B}, {T})")
+
+    mask_f = response_mask.to(torch.float32)
+    total_tokens = mask_f.sum().clamp_min(1.0)
+    total_loss = student_logits.new_zeros((), dtype=torch.float32)
+
+    for start in range(0, T, chunk_size):
+        end = min(start + chunk_size, T)
+        chunk_mask = mask_f[:, start:end]
+        if chunk_mask.sum().item() == 0:
+            continue
+        teacher_chunk = teacher_chunk_fetcher(start, end)
+        if teacher_chunk.shape[1] != (end - start):
+            raise RuntimeError(f"fetcher returned chunk of length {teacher_chunk.shape[1]}, expected {end - start}")
+        per_tok = fn(student_logits[:, start:end].float(), teacher_chunk.float(), temperature)
+        total_loss = total_loss + (per_tok * chunk_mask).sum()
+
+    return total_loss / total_tokens
+
+
+def per_token_logprobs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    """Look up each label's log-probability under the fp32 softmax of ``logits``.
+
+    Intended for diagnostic logging, e.g. the mean student or teacher
+    log-prob of a rollout.
+
+    Args:
+        logits: ``[B, T, V]``.
+        labels: ``[B, T]`` token ids.
+
+    Returns:
+        ``[B, T]`` of log-probabilities.
+    """
+    index = labels[..., None]
+    return torch.gather(F.log_softmax(logits.float(), dim=-1), -1, index)[..., 0]
diff --git a/training/opsd/main.py b/training/opsd/main.py
index 534c8ae0a..62298829a 100644
--- a/training/opsd/main.py
+++ b/training/opsd/main.py
@@ -24,11 +24,11 @@
 from torch.utils.data import DataLoader
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from deepspeed.runtime.rlhf.config import OPSDConfig
+from config import OPSDConfig
 from deepspeed.runtime.rlhf.data import LeftPaddedPromptCollator, PromptDataset
 from deepspeed.runtime.rollout import build_rollout
 from deepspeed.runtime.rlhf.teacher import TeacherWrapper
-from deepspeed.runtime.rlhf.trainer.opsd import OPSDTrainer
+from trainer import OPSDTrainer
 
 
 def _seed_everything(seed: int) -> None:
diff --git a/training/opsd/scripts/train_opsd_vllm.sh b/training/opsd/scripts/train_opsd_vllm.sh
deleted file mode 100644
index f39d659ec..000000000
--- a/training/opsd/scripts/train_opsd_vllm.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-#
-# Launch OPSD training with vLLM rollout.
-#
-# The vLLM server is started **lazily** as a subprocess by training rank 0
-# on first use, so no separate vLLM launch step is required.  The GPUs
-# assigned to the vLLM server are controlled by the ROLLOUT_VISIBLE_DEVICE
-# environment variable (comma-separated CUDA device indices).  The training
-# ranks must run on a *different* set of GPUs so the two don't contend for
-# memory.
-#
-# Default topology: ranks 0..5 train on GPUs 0-5 (ZeRO-3), devices 6-7
-# run vLLM with TP=2.  Override via:
-#   ROLLOUT_VISIBLE_DEVICE=... NUM_TRAIN_GPUS=.. INCLUDE_GPUS=.. bash ...
-set -euo pipefail
-
-CONFIG="${1:-configs/opsd_vllm_disjoint.json}"
-NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}"
-INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}"
-export ROLLOUT_VISIBLE_DEVICE="${ROLLOUT_VISIBLE_DEVICE:-6,7}"
-
-deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \
-    main.py --config "${CONFIG}"
diff --git a/training/opsd/tests/test_losses.py b/training/opsd/tests/test_losses.py
index 41ea92289..2e28874be 100644
--- a/training/opsd/tests/test_losses.py
+++ b/training/opsd/tests/test_losses.py
@@ -13,8 +13,8 @@
 import pytest
 import torch
 
-from deepspeed.runtime.rlhf.losses import chunked_distillation_loss, per_token_logprobs
-from deepspeed.runtime.rlhf.utils import build_response_mask, shift_for_next_token_prediction
+from losses import chunked_distillation_loss, per_token_logprobs
+from utils import build_response_mask, shift_for_next_token_prediction
 
 
 @pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py
index 36d2fcea8..090aa3ed7 100644
--- a/training/opsd/tests/test_teacher_caching.py
+++ b/training/opsd/tests/test_teacher_caching.py
@@ -85,7 +85,7 @@ def test_streamed_chunked_loss_matches_full_loss():
     """End-to-end check: pulling teacher logits chunk-by-chunk through the
     cache yields the same distillation loss as passing the full teacher tensor
     to ``chunked_distillation_loss`` directly."""
-    from deepspeed.runtime.rlhf.losses import chunked_distillation_loss
+    from losses import chunked_distillation_loss
 
     torch.manual_seed(0)
     s = torch.randn(2, 64, 32)
diff --git a/training/opsd/trainer.py b/training/opsd/trainer.py
new file mode 100644
index 000000000..34e60807c
--- /dev/null
+++ b/training/opsd/trainer.py
@@ -0,0 +1,210 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""On-policy distillation (OPSD) training loop.
+
+Each step is three phases:
+
+  0. **Rollout.** The student generates responses for the batch's prompts
+     (via the configured :class:`~deepspeed.runtime.rollout.RolloutEngine`).
+  1. **Teacher.** The frozen teacher runs a forward over prompt+response. The
+     full logit tensor is parked on the host via
+     :class:`~opsd.teacher.TeacherLogitCache` so teacher GPU buffers can be
+     released before the student backward.
+  2. **Student.** The student runs forward+backward on prompt+response. The
+     loss is the per-token divergence to the teacher, streamed from the
+     host-resident cache one sequence chunk at a time
+     (:func:`~losses.streamed_distillation_loss`), so
+     the full ``[B, T, V]`` teacher tensor never co-resides with the student
+     logits on the training device.
+
+The trainer itself contains no DeepSpeed-specific control flow beyond the
+``backward`` / ``step`` calls on the student engine; backend choice (ZeRO
+stage, offload, hybrid engine) is owned entirely by the DeepSpeed JSON config.
+"""
+
+import os
+import time
+from abc import ABC, abstractmethod
+from typing import Any
+
+import torch
+from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+
+from config import OPSDConfig
+from losses import streamed_distillation_loss
+from utils import build_response_mask
+from deepspeed.runtime.rollout import RolloutEngine, RolloutRequest, SamplingConfig
+
+
+def _is_rank_zero() -> bool:  # True on rank 0, and on every process while dist is not initialized
+    return dist.get_rank() == 0 if dist.is_initialized() else True
+
+
+class RLHFTrainer(ABC):
+    """Abstract interface for RLHF training loops: a ``train`` driver plus a per-batch ``_train_step``."""
+
+    @abstractmethod
+    def train(self) -> None:  # run the whole training loop
+        ...
+
+    @abstractmethod
+    def _train_step(self, batch: Any) -> dict:  # process one batch; OPSDTrainer returns a metrics dict
+        ...
+
+
+class OPSDTrainer(RLHFTrainer):
+    """Rollout -> teacher forward -> student forward/backward loop for on-policy distillation."""
+
+    def __init__(
+        self,
+        cfg: OPSDConfig,
+        student_engine: Any,
+        teacher: Any,
+        tokenizer: Any,
+        rollout: RolloutEngine,
+        dataloader: Any,
+    ):
+        self.cfg = cfg
+        self.student_engine = student_engine
+        self.teacher = teacher
+        self.tokenizer = tokenizer
+        self.rollout = rollout
+        self.dataloader = dataloader
+
+        self.device = get_accelerator().current_device_name()
+        self.step = 0  # number of training steps completed so far
+
+    # --- Driver ---
+
+    def train(self) -> None:
+        """Run ``_train_step`` over the dataloader for each epoch, stopping early at ``max_steps`` when > 0."""
+        max_steps = self.cfg.training.max_steps
+        for _ in range(self.cfg.training.num_train_epochs):
+            for batch in self.dataloader:
+                if max_steps > 0 and self.step >= max_steps:
+                    return
+                metrics = self._train_step(batch)
+                self._maybe_log(metrics)
+                # Count the step before the save check so tag ``step_N`` holds exactly N training steps.
+                self.step += 1
+                self._maybe_save()
+            if max_steps > 0 and self.step >= max_steps:
+                return
+
+    # --- One step ---
+
+    def _train_step(self, batch) -> dict:
+        """Run rollout, teacher forward and one student update on ``batch``; return loss, timings, token count."""
+        t_start = time.perf_counter()
+
+        prompt_ids = batch["prompt_ids"].to(self.device, non_blocking=True)
+        prompt_attn = batch["prompt_attention_mask"].to(self.device, non_blocking=True)
+
+        # Sync student weights into the rollout backend.
+        # No-op for hybrid engine (the only engine OPSDConfig.validate accepts).
+        self.rollout.sync_weights(self.step)
+
+        # --- Phase 0: rollout (student generates responses) ---------------
+        # Switch hybrid engine to inference mode (gathers ZeRO-3 params).
+        self.student_engine.eval()
+        sampling = SamplingConfig(
+            max_new_tokens=self.cfg.rollout.max_response_length,
+            temperature=self.cfg.rollout.temperature,
+            top_p=self.cfg.rollout.top_p,
+            top_k=self.cfg.rollout.top_k,
+            n_samples_per_prompt=self.cfg.rollout.n_samples_per_prompt,
+        )
+        roll = self.rollout.generate(
+            RolloutRequest(prompt_ids=prompt_ids, prompt_attention_mask=prompt_attn),
+            sampling,
+        )
+        input_ids = roll.input_ids.to(self.device, non_blocking=True)
+        attention_mask = roll.attention_mask.to(self.device, non_blocking=True)
+        response_start_idx = roll.response_start_idx.to(self.device, non_blocking=True)
+        response_mask = build_response_mask(response_start_idx, attention_mask)
+        t_rollout = time.perf_counter() - t_start
+
+        # --- Phase 1: teacher forward → host-cached logits ----------------
+        t1 = time.perf_counter()
+        teacher_cache = self.teacher.forward_to_cache(input_ids, attention_mask)
+        t_teacher = time.perf_counter() - t1
+
+        # --- Phase 2: student forward + streamed KL + backward ------------
+        t2 = time.perf_counter()
+        self.student_engine.train()
+        outputs = self.student_engine(input_ids=input_ids, attention_mask=attention_mask)
+        student_logits = outputs.logits  # [B, T, V]
+
+        # Shift for next-token prediction: logits at position t predict token
+        # at t+1, so the loss aligns student_logits[:, :-1] with the position
+        # t+1 entries of the response mask.
+        student_logits_shifted = student_logits[:, :-1, :]
+        mask_shifted = response_mask[:, 1:].contiguous()
+
+        def _fetch(start: int, end: int) -> torch.Tensor:
+            # The cache holds *unshifted* teacher logits; for the next-token
+            # objective we ask the cache for positions [start, end) of the
+            # shifted teacher, which is positions [start, end) of the original
+            # since we already lopped off the final column in the student.
+            return teacher_cache.chunk_to_device(start,
+                                                 end,
+                                                 device=student_logits_shifted.device,
+                                                 dtype=student_logits_shifted.dtype)
+
+        loss = streamed_distillation_loss(
+            student_logits=student_logits_shifted,
+            teacher_chunk_fetcher=_fetch,
+            response_mask=mask_shifted,
+            loss_type=self.cfg.distillation.loss_type,
+            temperature=self.cfg.distillation.temperature,
+            chunk_size=self.cfg.distillation.chunk_size,
+        )
+
+        self.student_engine.backward(loss)
+        self.student_engine.step()
+
+        teacher_cache.free()
+        t_student = time.perf_counter() - t2
+
+        # Reduce loss across ranks for a clean log line.
+        loss_for_log = loss.detach().clone()
+        if dist.is_initialized():
+            dist.all_reduce(loss_for_log)
+            loss_for_log /= dist.get_world_size()
+
+        return {
+            "loss": float(loss_for_log.item()),
+            "rollout_s": t_rollout,
+            "teacher_s": t_teacher,
+            "student_s": t_student,
+            "step_s": time.perf_counter() - t_start,
+            "response_tokens": int(mask_shifted.sum().item()),
+        }
+
+    # --- Logging / checkpointing ---
+
+    def _maybe_log(self, metrics: dict) -> None:
+        """Print ``metrics`` on rank 0 every ``logging_steps`` steps; a non-positive interval disables logging."""
+        if self.cfg.training.logging_steps <= 0 or self.step % self.cfg.training.logging_steps != 0:
+            return
+        if not _is_rank_zero():
+            return
+        print(f"[opsd][step {self.step}] loss={metrics['loss']:.4f} "
+              f"rollout={metrics['rollout_s']:.2f}s teacher={metrics['teacher_s']:.2f}s "
+              f"student={metrics['student_s']:.2f}s step={metrics['step_s']:.2f}s "
+              f"resp_tok={metrics['response_tokens']}")
+
+    def _maybe_save(self) -> None:
+        """Checkpoint the student every ``save_steps`` completed steps; a non-positive interval disables saving."""
+        if self.step == 0:
+            return
+        if self.cfg.training.save_steps <= 0 or self.step % self.cfg.training.save_steps != 0:
+            return
+        tag = f"step_{self.step}"
+        os.makedirs(self.cfg.training.save_dir, exist_ok=True)
+        self.student_engine.save_checkpoint(self.cfg.training.save_dir, tag=tag)
+        if _is_rank_zero():
+            print(f"[opsd] saved checkpoint to {self.cfg.training.save_dir}/{tag}")
diff --git a/training/opsd/utils.py b/training/opsd/utils.py
new file mode 100644
index 000000000..b2954407b
--- /dev/null
+++ b/training/opsd/utils.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Small tensor/masking helpers shared by trainer, losses, and tests.
+
+These intentionally stay free of DeepSpeed / distributed imports so the
+non-distributed unit tests can exercise them on CPU without a torchrun
+launcher.
+"""
+
+import torch
+
+
+def build_response_mask(response_start_idx: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Mark positions belonging to the response (not prompt, not padding).
+
+    Args:
+        response_start_idx: ``[B]`` int tensor — the first column index that is
+            part of the response, per sample. For *right-padded* prompts this
+            equals the prompt's token count; for *left-padded* prompts (the
+            usual layout for causal generation) it is the width of the prompt
+            section, i.e. the column where the prompt ends and the response begins.
+        attention_mask: ``[B, T]`` — 1 on real tokens (prompt + response), 0 on
+            padding.
+
+    Returns:
+        ``[B, T]`` 0/1 mask with the same dtype as ``attention_mask``. 1 only
+        at positions ``t >= response_start_idx[b]`` that are also attended.
+    """
+    if response_start_idx.dim() != 1:
+        raise ValueError(f"response_start_idx must be 1-D, got shape {tuple(response_start_idx.shape)}")
+    if attention_mask.dim() != 2:
+        raise ValueError(f"attention_mask must be 2-D, got shape {tuple(attention_mask.shape)}")
+    B, T = attention_mask.shape
+    if response_start_idx.shape[0] != B:
+        raise ValueError(f"response_start_idx batch ({response_start_idx.shape[0]}) != "
+                         f"attention_mask batch ({B})")
+
+    pos = torch.arange(T, device=attention_mask.device).unsqueeze(0).expand(B, T)  # column index of each cell
+    is_response = pos >= response_start_idx.to(pos.dtype).unsqueeze(1)  # per-row threshold broadcast over T
+    return is_response.to(attention_mask.dtype) * attention_mask  # multiply so unattended cells stay 0
+
+
+def shift_for_next_token_prediction(logits: torch.Tensor, labels: torch.Tensor):
+    """Align logits at position t with the label at position t+1.
+
+    Returns:
+        Tuple ``(logits[:, :-1, :], labels[:, 1:])`` — the last logit step and
+        the first label column are dropped, and both results are contiguous.
+    """
+    return logits[:, :-1, :].contiguous(), labels[:, 1:].contiguous()

From 0e1c004c27c54b83de33ef713a59a3d18606c8ee Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 18:19:14 +0800
Subject: [PATCH 4/8] Move teacher.py and data.py from DeepSpeed to
 DeepSpeedExamples

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/data.py                       | 108 +++++++++++
 training/opsd/main.py                       |   4 +-
 training/opsd/teacher.py                    | 191 ++++++++++++++++++++
 training/opsd/tests/test_teacher_caching.py |   2 +-
 4 files changed, 302 insertions(+), 3 deletions(-)
 create mode 100644 training/opsd/data.py
 create mode 100644 training/opsd/teacher.py

diff --git a/training/opsd/data.py b/training/opsd/data.py
new file mode 100644
index 000000000..8ce86b56c
--- /dev/null
+++ b/training/opsd/data.py
@@ -0,0 +1,108 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Prompt dataset and left-padding collator for OPSD rollouts.
+
+The dataset reads a JSONL file with one record per line; each record must
+contain a string under :attr:`DataConfig.prompt_field` (default ``"prompt"``).
+If the tokenizer exposes ``apply_chat_template``, single-turn prompts are
+wrapped in a user-role message with ``add_generation_prompt=True`` so the
+student generates the assistant turn.
+
+Batches are **left-padded** because causal generation requires real tokens at
+the right edge — :class:`deepspeed.runtime.rollout.RolloutRequest` and the hybrid-engine
+backend both assume this layout.
+"""
+
+import json
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.utils.data import Dataset
+
+
+class PromptDataset(Dataset):
+    """Reads ``{prompt_field: str}`` records from a UTF-8 JSONL file."""
+
+    def __init__(
+        self,
+        path: str,
+        tokenizer: Any,
+        max_prompt_length: int,
+        prompt_field: str = "prompt",
+        chat_template: Optional[str] = None,
+    ):
+        self.records = self._load_jsonl(path)
+        self.tokenizer = tokenizer
+        self.max_prompt_length = max_prompt_length
+        self.prompt_field = prompt_field
+        self.chat_template = chat_template
+
+    @staticmethod
+    def _load_jsonl(path: str) -> List[Dict[str, Any]]:
+        records: List[Dict[str, Any]] = []
+        with open(path, "r", encoding="utf-8") as f:  # explicit: do not depend on the locale encoding
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                records.append(json.loads(line))
+        return records
+
+    def __len__(self) -> int:
+        return len(self.records)
+
+    def __getitem__(self, idx: int) -> str:
+        rec = self.records[idx]
+        if self.prompt_field not in rec:
+            raise KeyError(f"record {idx} missing field {self.prompt_field!r}")
+        text = rec[self.prompt_field]
+
+        # Render as a single user turn + generation prompt. A plain-string prompt passes
+        # through unchanged when neither the tokenizer nor the config has a chat template.
+        no_template = self.chat_template is None and getattr(self.tokenizer, "chat_template", "") is None
+        if hasattr(self.tokenizer, "apply_chat_template") and not (no_template and isinstance(text, str)):
+            messages = [{"role": "user", "content": text}] if isinstance(text, str) else text
+            text = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                chat_template=self.chat_template,
+            )
+        return text
+
+
+class LeftPaddedPromptCollator:
+    """Turns a list of prompt strings into left-padded id / attention-mask tensors."""
+
+    def __init__(self, tokenizer: Any, max_prompt_length: int):
+        self.tokenizer = tokenizer
+        self.max_prompt_length = max_prompt_length
+        self.pad_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id
+        if self.pad_id is None:
+            raise ValueError("tokenizer has neither pad_token_id nor eos_token_id; "
+                             "cannot construct a padding collator")
+
+    def __call__(self, batch_texts: List[str]) -> Dict[str, torch.Tensor]:
+        # Tokenize each prompt on its own: no special tokens, truncated to the cap.
+        rows = []
+        for text in batch_texts:
+            enc = self.tokenizer(text,
+                                 add_special_tokens=False,
+                                 truncation=True,
+                                 max_length=self.max_prompt_length,
+                                 return_tensors="pt")
+            rows.append(enc["input_ids"].squeeze(0))
+        width = max(int(r.shape[0]) for r in rows)
+
+        # Pre-fill with the pad id / zeros, then copy every row flush right.
+        prompt_ids = torch.full((len(rows), width), self.pad_id, dtype=torch.long)
+        attention_mask = torch.zeros((len(rows), width), dtype=torch.long)
+        for row_idx, ids in enumerate(rows):
+            start = width - int(ids.shape[0])
+            # left-pad: real tokens at the right edge
+            prompt_ids[row_idx, start:] = ids
+            attention_mask[row_idx, start:] = 1
+
+        return {"prompt_ids": prompt_ids, "prompt_attention_mask": attention_mask}
diff --git a/training/opsd/main.py b/training/opsd/main.py
index 62298829a..91bf4a4d6 100644
--- a/training/opsd/main.py
+++ b/training/opsd/main.py
@@ -25,9 +25,9 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from config import OPSDConfig
-from deepspeed.runtime.rlhf.data import LeftPaddedPromptCollator, PromptDataset
+from data import LeftPaddedPromptCollator, PromptDataset
 from deepspeed.runtime.rollout import build_rollout
-from deepspeed.runtime.rlhf.teacher import TeacherWrapper
+from teacher import TeacherWrapper
 from trainer import OPSDTrainer
 
 
diff --git a/training/opsd/teacher.py b/training/opsd/teacher.py
new file mode 100644
index 000000000..1afaddd68
--- /dev/null
+++ b/training/opsd/teacher.py
@@ -0,0 +1,191 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Frozen teacher: two-phase forward with CPU-cached logits.
+
+The trainer runs each step in two phases:
+
+  1. **Teacher phase.** Forward over the prompt+response. The full ``[B, T, V]``
+     logit tensor is moved off the GPU into a :class:`TeacherLogitCache` so that
+     teacher weight buffers can be released before the student backward pass.
+  2. **Student phase.** Forward + backward on the student. The distillation
+     loss pulls teacher logits back to GPU **one sequence chunk at a time** via
+     :meth:`TeacherLogitCache.chunk_to_device`, so peak GPU memory for teacher
+     data is only ``[B, chunk, V]``.
+
+This module deliberately lazy-imports ``deepspeed`` and ``transformers`` so
+that the pure data-handling pieces (``TeacherLogitCache`` and the streamed
+loss in :mod:`opsd.losses`) remain importable in CPU-only unit tests that do
+not have a working DeepSpeed launcher.
+"""
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+
+# NOTE: ``config`` imports ``RolloutConfig`` from ``deepspeed.runtime.rollout``,
+# so this module-level import does pull in DeepSpeed at load time.
+from config import TeacherConfig
+
+
+@dataclass
+class TeacherLogitCache:
+    """CPU-resident teacher logits with on-demand chunk fetch.
+
+    Stored in low precision (default ``bfloat16``) to halve host memory; the
+    consumer in :mod:`opsd.losses` promotes back to fp32 inside the divergence
+    so the KD math stays well-conditioned.
+    """
+
+    cpu_logits: torch.Tensor  # [B, T, V]
+
+    def __post_init__(self) -> None:
+        if self.cpu_logits.dim() != 3:
+            raise ValueError(f"cpu_logits must be 3-D [B, T, V]; got shape "
+                             f"{tuple(self.cpu_logits.shape)}")
+        if self.cpu_logits.device.type != "cpu":
+            raise ValueError(f"cpu_logits must live on CPU; got device "
+                             f"{self.cpu_logits.device}")
+
+    @classmethod
+    def from_gpu_logits(cls, logits: torch.Tensor, store_dtype: torch.dtype = torch.bfloat16) -> "TeacherLogitCache":
+        """Detach + downcast + copy synchronously into (pinned) host memory.
+
+        A ``non_blocking`` device-to-host copy can return before the data has
+        landed, so a blocking copy keeps ``cpu_logits`` safe to read at once;
+        hosts that cannot pin (e.g. CPU-only tests) fall back to ``.cpu()``.
+        """
+        downcast = logits.detach().to(dtype=store_dtype)
+        try:
+            host = torch.empty(downcast.shape, dtype=store_dtype, pin_memory=True)
+            host.copy_(downcast)
+        except RuntimeError:
+            host = downcast.cpu()
+        return cls(cpu_logits=host)
+
+    @property
+    def shape(self) -> Tuple[int, int, int]:
+        s = self.cpu_logits.shape
+        return (int(s[0]), int(s[1]), int(s[2]))
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.cpu_logits.dtype
+
+    def chunk_to_device(self,
+                        start: int,
+                        end: int,
+                        device: torch.device,
+                        dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        """Slice ``[:, start:end, :]`` and stage it on ``device``.
+
+        ``dtype`` is the dtype on the destination; if ``None``, the stored
+        dtype is preserved.
+        """
+        _, T, _ = self.shape
+        if not (0 <= start < end <= T):
+            raise ValueError(f"chunk bounds [{start}, {end}) invalid for T={T}")
+        chunk = self.cpu_logits[:, start:end]
+        out = chunk.to(device=device, dtype=dtype if dtype is not None else chunk.dtype, non_blocking=True)
+        return out
+
+    def free(self) -> None:
+        """Drop the underlying buffer so a step's teacher cache can be GC'd
+        before the next teacher forward."""
+        self.cpu_logits = torch.empty(0)
+
+
+_DTYPE_MAP = {
+    "float16": torch.float16,
+    "fp16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    "bf16": torch.bfloat16,
+    "float32": torch.float32,
+    "fp32": torch.float32,
+}
+
+
+def _resolve_dtype(name: str) -> torch.dtype:
+    if name in _DTYPE_MAP:
+        return _DTYPE_MAP[name]
+    raise ValueError(f"Unknown dtype {name!r}; choose from {sorted(_DTYPE_MAP)}")
+
+
+class TeacherWrapper:
+    """Frozen teacher.
+
+    Two modes depending on ``cfg.offload_to_cpu``:
+
+      * ``offload_to_cpu=False`` — load the teacher with HF's standard
+        ``from_pretrained`` and pin it on the local accelerator device. The
+        whole teacher lives in GPU memory; simplest path and what to use when
+        the teacher fits.
+
+      * ``offload_to_cpu=True`` — wrap the loaded model with
+        ``deepspeed.initialize`` using a ZeRO-3 config with
+        ``offload_param.device='cpu'``. The optimizer slot is unused (no
+        trainable params) but ZeRO-3 gives us per-forward parameter gather
+        / release and keeps weights on the host between forwards. This is the
+        path to use when the teacher would otherwise not fit alongside the
+        student.
+
+    Both paths load the full checkpoint on each rank before DeepSpeed (if
+    used) partitions; we intentionally do **not** wrap ``from_pretrained``
+    in ``deepspeed.zero.Init()`` because HF's loader partitions
+    ``low_cpu_mem_usage`` params to zero-width shards before the checkpoint
+    can fill them, which surfaces as a "size mismatch" load error.
+    """
+
+    def __init__(self, cfg: TeacherConfig, world_size: int) -> None:  # world_size is not read in this body
+        from deepspeed.accelerator import get_accelerator
+        from transformers import AutoModelForCausalLM
+
+        self.cfg = cfg
+        dtype = _resolve_dtype(cfg.dtype)
+        device = get_accelerator().current_device_name()
+
+        model = AutoModelForCausalLM.from_pretrained(
+            cfg.model_name_or_path,
+            torch_dtype=dtype,
+            trust_remote_code=cfg.trust_remote_code,
+        )
+        model.eval()  # eval mode; every parameter is frozen just below
+        for p in model.parameters():
+            p.requires_grad_(False)
+
+        if cfg.offload_to_cpu:
+            import deepspeed
+
+            ds_config = {
+                "train_micro_batch_size_per_gpu": 1,
+                "bf16": {
+                    "enabled": dtype is torch.bfloat16
+                },
+                "fp16": {
+                    "enabled": dtype is torch.float16
+                },
+                "zero_optimization": {
+                    "stage": 3,
+                    "offload_param": {
+                        "device": "cpu"
+                    },
+                },
+            }
+            engine, *_ = deepspeed.initialize(model=model, config=ds_config)  # keep the engine, drop the rest
+            self._callable = engine
+            self._uses_ds = True
+        else:
+            model.to(device)
+            self._callable = model
+            self._uses_ds = False
+
+    @torch.no_grad()
+    def forward_to_cache(self,
+                         input_ids: torch.Tensor,
+                         attention_mask: torch.Tensor,
+                         store_dtype: torch.dtype = torch.bfloat16) -> TeacherLogitCache:
+        """Run the teacher forward under ``no_grad`` and copy ``outputs.logits`` into a host-side cache."""
+        outputs = self._callable(input_ids=input_ids, attention_mask=attention_mask)
+        return TeacherLogitCache.from_gpu_logits(outputs.logits, store_dtype=store_dtype)
diff --git a/training/opsd/tests/test_teacher_caching.py b/training/opsd/tests/test_teacher_caching.py
index 090aa3ed7..f44bd6e7a 100644
--- a/training/opsd/tests/test_teacher_caching.py
+++ b/training/opsd/tests/test_teacher_caching.py
@@ -13,7 +13,7 @@
 import pytest
 import torch
 
-from deepspeed.runtime.rlhf.teacher import TeacherLogitCache
+from teacher import TeacherLogitCache
 
 
 def test_round_trip_preserves_values_within_dtype():

From d0000be492af4e9529c79c106d79c10d18725f07 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 20:45:37 +0800
Subject: [PATCH 5/8] Extend RolloutConfig with app-level generation knobs;
 clean vLLM remnants from JSON

- Subclass DeepSpeed's RolloutConfig to add temperature/top_p/etc
- Remove weight_sync_interval from JSON configs (vLLM remnant)

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/config.py                       | 13 ++++++++++++-
 training/opsd/configs/opsd_hybrid_engine.json |  1 -
 training/opsd/configs/smoke_hybrid.json       |  1 -
 training/opsd/configs/smoke_hybrid_gc.json    |  1 -
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/training/opsd/config.py b/training/opsd/config.py
index 66ff7e21e..1ccb9e9c2 100644
--- a/training/opsd/config.py
+++ b/training/opsd/config.py
@@ -13,7 +13,18 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from deepspeed.runtime.rollout import RolloutConfig
+from deepspeed.runtime.rollout import RolloutConfig as _BaseRolloutConfig
+
+
+@dataclass
+class RolloutConfig(_BaseRolloutConfig):
+    """Extends DeepSpeed's ``RolloutConfig`` with OPSD prompt/response length and sampling knobs."""
+    max_prompt_length: int = 1024
+    max_response_length: int = 1024
+    temperature: float = 0.0  # NOTE(review): 0.0 presumably selects greedy decoding — confirm in the rollout backend
+    top_p: float = 1.0
+    top_k: int = -1  # NOTE(review): -1 presumably disables top-k filtering — confirm in the rollout backend
+    n_samples_per_prompt: int = 1
 
 
 @dataclass
diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json
index d2ebb8b03..f83503c27 100644
--- a/training/opsd/configs/opsd_hybrid_engine.json
+++ b/training/opsd/configs/opsd_hybrid_engine.json
@@ -18,7 +18,6 @@
         "top_p": 1.0,
         "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "weight_sync_interval": 1
+        "n_samples_per_prompt": 1
     },
     "distillation": {
         "loss_type": "reverse_kl",
diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json
index 774092926..51d80feea 100644
--- a/training/opsd/configs/smoke_hybrid.json
+++ b/training/opsd/configs/smoke_hybrid.json
@@ -18,7 +18,6 @@
         "top_p": 1.0,
         "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "weight_sync_interval": 1
+        "n_samples_per_prompt": 1
     },
     "distillation": {
         "loss_type": "reverse_kl",
diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json
index 0512c1581..8c563b199 100644
--- a/training/opsd/configs/smoke_hybrid_gc.json
+++ b/training/opsd/configs/smoke_hybrid_gc.json
@@ -19,7 +19,6 @@
         "top_k": -1,
         "n_samples_per_prompt": 1,
-        "use_graph_capture": true,
-        "weight_sync_interval": 1
+        "use_graph_capture": true
     },
     "distillation": {
         "loss_type": "reverse_kl",

From d3eda203def138f8dfd786277e2641ea74482bf5 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 21:57:46 +0800
Subject: [PATCH 6/8] Keep only bench_decode_1p1r; add --graph-capture flag and
 engine wrapper fix

- Remove 14B/multi-GPU benchmarks (bench_14b_rollout, bench_autotp_gc,
  bench_hybrid_tp, bench_hybrid_tp_opt)
- Fix bench_decode_1p1r: wrap model for HybridEngineRollout
- Add --graph-capture CLI flag

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/benchmarks/bench_14b_rollout.py | 134 ----------------
 training/opsd/benchmarks/bench_autotp_gc.py   |  96 -----------
 training/opsd/benchmarks/bench_decode_1p1r.py |   8 +-
 training/opsd/benchmarks/bench_hybrid_tp.py   | 145 -----------------
 .../opsd/benchmarks/bench_hybrid_tp_opt.py    | 149 ------------------
 5 files changed, 6 insertions(+), 526 deletions(-)
 delete mode 100644 training/opsd/benchmarks/bench_14b_rollout.py
 delete mode 100644 training/opsd/benchmarks/bench_autotp_gc.py
 delete mode 100644 training/opsd/benchmarks/bench_hybrid_tp.py
 delete mode 100644 training/opsd/benchmarks/bench_hybrid_tp_opt.py

diff --git a/training/opsd/benchmarks/bench_14b_rollout.py b/training/opsd/benchmarks/bench_14b_rollout.py
deleted file mode 100644
index d66c7615d..000000000
--- a/training/opsd/benchmarks/bench_14b_rollout.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
-import time
-import os
-import sys
-import torch
-import deepspeed
-from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-MODEL = "Qwen/Qwen2.5-14B-Instruct"
-MAX_NEW_TOKENS = 256
-N_SAMPLES = 1
-CB_SIZE = 1
-N_RUNS = 5
-PROMPT = "def fibonacci(n):"
-
-
-def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
-    rank = torch.distributed.get_rank()
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    rollout = HybridEngineRollout(
-        engine=engine,
-        tokenizer=tokenizer,
-        continuous_batching_size=cb_size,
-        use_graph_capture=use_graph_capture,
-    )
-
-    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
-    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
-    sampling = SamplingConfig(
-        max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95,
-        n_samples_per_prompt=N_SAMPLES
-    )
-
-    # Warmup
-    torch.manual_seed(42)
-    engine.eval()
-    rollout.generate(req, sampling)
-    engine.train()
-
-    # Benchmark
-    times = []
-    total_toks = 0
-    for i in range(N_RUNS):
-        torch.manual_seed(42 + i)
-        engine.eval()
-        torch.cuda.synchronize()
-        t0 = time.time()
-        batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()
-        times.append(time.time() - t0)
-        engine.train()
-
-    # Count tokens from last run
-    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-    for i in range(batch.input_ids.shape[0]):
-        resp = batch.input_ids[i, batch.response_start_idx[i]:]
-        total_toks += (resp != pad_id).sum().item()
-
-    t_avg = sum(times[1:]) / len(times[1:])
-
-    if rank == 0:
-        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s  "
-              f"runs={[f'{t*1000:.0f}' for t in times]}")
-
-    return total_toks, t_avg
-
-
-def main():
-    deepspeed.init_distributed()
-    rank = torch.distributed.get_rank()
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)
-
-    world_size = torch.distributed.get_world_size()
-    tp_size = world_size  # all GPUs used for TP
-
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
-
-    ds_config = {
-        "bf16": {"enabled": True},
-        "zero_optimization": {"stage": 0},
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "hybrid_engine": {
-            "enabled": True,
-            "max_out_tokens": 512,
-            "inference_tp_size": 1,
-            "release_inference_cache": False,
-            "pin_parameters": True,
-            "tp_gather_partition_size": 8,
-        },
-    }
-
-    if tp_size > 1:
-        ds_config["tensor_parallel"] = {
-            "autotp_size": tp_size,
-            "preset_model": "qwen2",
-            "tp": {"tp_size": tp_size},
-        }
-
-    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
-
-    if rank == 0:
-        print(f"\n{'='*60}")
-        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
-        print(f"{'='*60}")
-
-    # 1P1R without graph capture (CB=1, no GC)
-    try:
-        bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}")
-    except Exception as e:
-        if rank == 0:
-            print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}")
-            import traceback; traceback.print_exc()
-
-    # 1P1R with CUDA graph capture
-    try:
-        bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}+GC")
-    except Exception as e:
-        if rank == 0:
-            print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}")
-            import traceback; traceback.print_exc()
-
-    if rank == 0:
-        print(f"{'='*60}\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/opsd/benchmarks/bench_autotp_gc.py b/training/opsd/benchmarks/bench_autotp_gc.py
deleted file mode 100644
index c9a245b24..000000000
--- a/training/opsd/benchmarks/bench_autotp_gc.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Benchmark rollout with AutoTP + graph capture on 14B model."""
-import time
-import torch
-import deepspeed
-from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-def main():
-    deepspeed.init_distributed()
-    rank = torch.distributed.get_rank()
-    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()
-    torch.cuda.set_device(local_rank)
-    device = torch.device(f"cuda:{local_rank}")
-
-    model_name = "Qwen/Qwen2.5-14B-Instruct"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, dtype=torch.bfloat16, trust_remote_code=True
-    )
-
-    ds_config = {
-        "bf16": {"enabled": True},
-        "zero_optimization": {"stage": 0},
-        "tensor_parallel": {
-            "autotp_size": 2,
-            "preset_model": "qwen2",
-            "tp": {"tp_size": 2},
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": 2,
-        "gradient_accumulation_steps": 1,
-        "hybrid_engine": {
-            "enabled": True,
-            "max_out_tokens": 512,
-            "inference_tp_size": 1,
-            "release_inference_cache": False,
-            "pin_parameters": True,
-            "tp_gather_partition_size": 8,
-        },
-    }
-
-    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
-
-    rollout = HybridEngineRollout(
-        engine=engine,
-        tokenizer=tokenizer,
-        continuous_batching_size=2,
-        use_graph_capture=True,
-    )
-
-    # Prepare prompt
-    prompt = "def fibonacci(n):"
-    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
-    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
-    sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4)
-
-    # Warmup
-    torch.manual_seed(42)
-    engine.eval()
-    rollout.generate(req, sampling)
-    engine.train()
-
-    # Benchmark
-    times = []
-    for i in range(5):
-        torch.manual_seed(42)
-        engine.eval()
-        torch.cuda.synchronize()
-        t0 = time.time()
-        batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()
-        times.append(time.time() - t0)
-        engine.train()
-
-    t_avg = sum(times[1:]) / len(times[1:])
-    # Count tokens
-    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-    total_toks = 0
-    for i in range(batch.input_ids.shape[0]):
-        resp = batch.input_ids[i, batch.response_start_idx[i]:]
-        total_toks += (resp != pad_id).sum().item()
-
-    if rank == 0:
-        print(f"\n{'='*60}")
-        print(f"Model: {model_name}")
-        print(f"TP=2, n=8, cb=4, graph_capture=True, max_new_tokens=256")
-        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
-        print(f"Total response tokens: {total_toks}")
-        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
-        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
-        print(f"{'='*60}\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/opsd/benchmarks/bench_decode_1p1r.py b/training/opsd/benchmarks/bench_decode_1p1r.py
index 58fb667d4..645a5fbd1 100644
--- a/training/opsd/benchmarks/bench_decode_1p1r.py
+++ b/training/opsd/benchmarks/bench_decode_1p1r.py
@@ -140,6 +140,7 @@ def main():
     parser.add_argument("--max-new-tokens", type=int, default=64)
     parser.add_argument("--num-warmup", type=int, default=3)
     parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--graph-capture", action="store_true", help="Enable CUDA graph capture")
     args = parser.parse_args()
 
     device = get_accelerator().current_device()  #ignore-cuda
@@ -161,8 +162,11 @@ def main():
     print(f"  Overhead/step:        {raw['overhead_ms_per_step']:.3f} ms  (total: {raw['overhead_total_ms']:.1f} ms)")
     print(f"  Total:                {raw['total_ms']:.1f} ms")
 
-    print(f"\n=== HybridEngineRollout benchmark ===")
-    rollout = HybridEngineRollout(model, tokenizer)
+    print(f"\n=== HybridEngineRollout benchmark (graph_capture={args.graph_capture}) ===")
+    engine = type('Engine', (), {'module': model})()  # lightweight wrapper
+    from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig
+    cfg = HybridEngineRolloutConfig(use_graph_capture=args.graph_capture)
+    rollout = HybridEngineRollout(engine, tokenizer, cfg=cfg)
     rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
                               args.num_iters)
     print(f"  Rollout generate:     {rr['rollout_total_ms']:.1f} ms")
diff --git a/training/opsd/benchmarks/bench_hybrid_tp.py b/training/opsd/benchmarks/bench_hybrid_tp.py
deleted file mode 100644
index 3f41150c7..000000000
--- a/training/opsd/benchmarks/bench_hybrid_tp.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-# DeepSpeed Team
-"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2).
-
-Usage:
-    deepspeed --num_gpus 2 bench_hybrid_tp.py \
-        --model Qwen/Qwen2.5-14B-Instruct \
-        --max-new-tokens 64
-"""
-
-import argparse
-import os
-import time
-
-import deepspeed
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    torch.manual_seed(42)
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for i in range(num_warmup + num_iters):
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            result = rollout.generate(request, sampling)
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        elapsed = time.perf_counter() - t0
-        times.append(elapsed)
-        if local_rank == 0:
-            label = "warmup" if i < num_warmup else "iter"
-            n_tokens = result.input_ids.shape[-1] - prompt_len
-            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
-
-    avg = np.mean(times[-num_iters:]) * 1000
-    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    local_rank = args.local_rank
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    deepspeed.init_distributed()
-
-    if local_rank == 0:
-        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===")
-        print(f"  Model:       {args.model}")
-        print(f"  TP size:     {world_size}")
-        print(f"  Prompt len:  {args.prompt_len}")
-        print(f"  Decode len:  {args.max_new_tokens}")
-        print(f"  Warmup:      {args.num_warmup}")
-        print(f"  Iters:       {args.num_iters}")
-        print()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        torch_dtype=torch.bfloat16,
-    )
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "tensor_parallel": {
-            "autotp_size": world_size,
-            "preset_model": "qwen2",
-        },
-    }
-
-    engine, *_ = deepspeed.initialize(
-        model=model,
-        optimizer=None,
-        model_parameters=model.parameters(),
-        config=ds_config,
-    )
-
-    if local_rank == 0:
-        print("  DeepSpeed engine initialized.")
-        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        print(f"  Parameters (local):  {param_count:.2f}B")
-        print(f"  GPU mem allocated:   {alloc:.1f} GB")
-        print()
-
-    rollout = HybridEngineRollout(engine, tokenizer)
-
-    if local_rank == 0:
-        print("  Running benchmark...")
-
-    result = bench_hybrid_rollout(
-        rollout,
-        tokenizer,
-        args.prompt_len,
-        args.max_new_tokens,
-        args.num_warmup,
-        args.num_iters,
-    )
-
-    if local_rank == 0:
-        total = result["rollout_total_ms"]
-        per_step = total / args.max_new_tokens
-        throughput = 1000.0 / per_step
-        print()
-        print(f"=== Results ===")
-        print(f"  Total generate:   {total:.1f} ms")
-        print(f"  Per decode step:  {per_step:.2f} ms")
-        print(f"  Throughput:       {throughput:.1f} tokens/s")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/opsd/benchmarks/bench_hybrid_tp_opt.py b/training/opsd/benchmarks/bench_hybrid_tp_opt.py
deleted file mode 100644
index d7fae2dde..000000000
--- a/training/opsd/benchmarks/bench_hybrid_tp_opt.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-# DeepSpeed Team
-"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer.
-
-Usage:
-    deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \
-        --model Qwen/Qwen2.5-14B-Instruct \
-        --max-new-tokens 64
-"""
-
-import argparse
-import os
-import time
-
-import deepspeed
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    torch.manual_seed(42)
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for i in range(num_warmup + num_iters):
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            result = rollout.generate(request, sampling)
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        elapsed = time.perf_counter() - t0
-        times.append(elapsed)
-        if local_rank == 0:
-            label = "warmup" if i < num_warmup else "iter"
-            n_tokens = result.input_ids.shape[-1] - prompt_len
-            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
-
-    avg = np.mean(times[-num_iters:]) * 1000
-    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    local_rank = args.local_rank
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    deepspeed.init_distributed()
-
-    if local_rank == 0:
-        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===")
-        print(f"  Model:       {args.model}")
-        print(f"  TP size:     {world_size}")
-        print(f"  Prompt len:  {args.prompt_len}")
-        print(f"  Decode len:  {args.max_new_tokens}")
-        print()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        torch_dtype=torch.bfloat16,
-    )
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "tensor_parallel": {
-            "autotp_size": world_size,
-            "preset_model": "qwen2",
-        },
-    }
-
-    engine, _, _, _ = deepspeed.initialize(
-        model=model,
-        model_parameters=model.parameters(),
-        config=ds_config,
-    )
-
-    if local_rank == 0:
-        print("  DeepSpeed engine initialized (with optimizer).")
-        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  Parameters (local):  {param_count:.2f}B")
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  GPU mem allocated:   {alloc:.1f} GB")
-        print(f"  GPU mem reserved:    {reserv:.1f} GB")
-        print()
-
-    rollout = HybridEngineRollout(engine, tokenizer)
-
-    if local_rank == 0:
-        print("  Running benchmark...")
-
-    result = bench_hybrid_rollout(
-        rollout,
-        tokenizer,
-        args.prompt_len,
-        args.max_new_tokens,
-        args.num_warmup,
-        args.num_iters,
-    )
-
-    if local_rank == 0:
-        total = result["rollout_total_ms"]
-        per_step = total / args.max_new_tokens
-        throughput = 1000.0 / per_step
-        print()
-        print(f"=== Results ===")
-        print(f"  Total generate:   {total:.1f} ms")
-        print(f"  Per decode step:  {per_step:.2f} ms")
-        print(f"  Throughput:       {throughput:.1f} tokens/s")
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  GPU mem (final):   alloc={alloc:.1f} GB, reserved={reserv:.1f} GB")
-
-
-if __name__ == "__main__":
-    main()

From 8bf134e1d841ac4b931e77ca034b2e7750abd534 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 23:06:41 +0800
Subject: [PATCH 7/8] Fix distillation temperature from 0 to 1.0 in smoke and
 production configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

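A distillation temperature of 0 divides the logits by zero, which yields
inf/NaN values and therefore a NaN loss. The standard default for
knowledge distillation is temperature=1.0 (plain softmax). Only the
"distillation" temperature is changed in these configs.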

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>
---
 training/opsd/configs/opsd_hybrid_engine.json | 2 +-
 training/opsd/configs/smoke_hybrid.json       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json
index f83503c27..feeef50c9 100644
--- a/training/opsd/configs/opsd_hybrid_engine.json
+++ b/training/opsd/configs/opsd_hybrid_engine.json
@@ -21,7 +21,7 @@
     },
     "distillation": {
         "loss_type": "reverse_kl",
-        "temperature": 0,
+        "temperature": 1.0,
         "chunk_size": 512
     },
     "training": {
diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json
index 51d80feea..c936694b8 100644
--- a/training/opsd/configs/smoke_hybrid.json
+++ b/training/opsd/configs/smoke_hybrid.json
@@ -21,7 +21,7 @@
     },
     "distillation": {
         "loss_type": "reverse_kl",
-        "temperature": 0,
+        "temperature": 1.0,
         "chunk_size": 128
     },
     "training": {

From 5d1d2e54b0dc53d4df9fe5709bf64a865d3a1a78 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@gmail.com>
Date: Fri, 3 Jul 2026 23:18:24 +0800
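Subject: [PATCH 8/8] Fix trailing commas in JSON configs; remove unused
 smoke_ds_zero3.json

Also rewrites "learning_rate" from 1e-6 to the equivalent 1e-06 and drops
the final newline of each modified config.

Signed-off-by: Guokai Ma <guokai.ma@gmail.com>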
---
 training/opsd/configs/opsd_hybrid_engine.json |  8 ++---
 training/opsd/configs/smoke_ds_zero3.json     | 35 -------------------
 training/opsd/configs/smoke_hybrid.json       |  8 ++---
 training/opsd/configs/smoke_hybrid_gc.json    |  8 ++---
 4 files changed, 12 insertions(+), 47 deletions(-)
 delete mode 100644 training/opsd/configs/smoke_ds_zero3.json

diff --git a/training/opsd/configs/opsd_hybrid_engine.json b/training/opsd/configs/opsd_hybrid_engine.json
index feeef50c9..3478a1fa1 100644
--- a/training/opsd/configs/opsd_hybrid_engine.json
+++ b/training/opsd/configs/opsd_hybrid_engine.json
@@ -2,7 +2,7 @@
     "student": {
         "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
         "dtype": "bfloat16",
-        "trust_remote_code": false,
+        "trust_remote_code": false
     },
     "teacher": {
         "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
@@ -17,7 +17,7 @@
         "temperature": 0,
         "top_p": 1.0,
         "top_k": -1,
-        "n_samples_per_prompt": 1,
+        "n_samples_per_prompt": 1
     },
     "distillation": {
         "loss_type": "reverse_kl",
@@ -28,7 +28,7 @@
         "train_batch_size": 1,
         "micro_batch_size_per_gpu": 1,
         "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
+        "learning_rate": 1e-06,
         "weight_decay": 0.0,
         "num_train_epochs": 1,
         "max_steps": -1,
@@ -44,4 +44,4 @@
         "shuffle": true
     },
     "deepspeed_config": "configs/ds_zero3.json"
-}
+}
\ No newline at end of file
diff --git a/training/opsd/configs/smoke_ds_zero3.json b/training/opsd/configs/smoke_ds_zero3.json
deleted file mode 100644
index 74211f3fb..000000000
--- a/training/opsd/configs/smoke_ds_zero3.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-    "bf16": {
-        "enabled": true
-    },
-    "zero_optimization": {
-        "stage": 3,
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "reduce_bucket_size": 5e7,
-        "stage3_prefetch_bucket_size": 5e7,
-        "stage3_param_persistence_threshold": 1e6,
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_16bit_weights_on_model_save": true
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": 1e-6,
-            "betas": [0.9, 0.95],
-            "eps": 1e-8,
-            "weight_decay": 0.0
-        }
-    },
-    "gradient_clipping": 1.0,
-    "hybrid_engine": {
-        "enabled": true,
-        "max_out_tokens": 512,
-        "inference_tp_size": 1,
-        "release_inference_cache": false,
-        "pin_parameters": true,
-        "tp_gather_partition_size": 8
-    },
-    "wall_clock_breakdown": false
-}
diff --git a/training/opsd/configs/smoke_hybrid.json b/training/opsd/configs/smoke_hybrid.json
index c936694b8..250214ddc 100644
--- a/training/opsd/configs/smoke_hybrid.json
+++ b/training/opsd/configs/smoke_hybrid.json
@@ -2,7 +2,7 @@
     "student": {
         "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
         "dtype": "bfloat16",
-        "trust_remote_code": false,
+        "trust_remote_code": false
     },
     "teacher": {
         "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
@@ -17,7 +17,7 @@
         "temperature": 0,
         "top_p": 1.0,
         "top_k": -1,
-        "n_samples_per_prompt": 1,
+        "n_samples_per_prompt": 1
     },
     "distillation": {
         "loss_type": "reverse_kl",
@@ -28,7 +28,7 @@
         "train_batch_size": 1,
         "micro_batch_size_per_gpu": 1,
         "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
+        "learning_rate": 1e-06,
         "weight_decay": 0.0,
         "num_train_epochs": 1,
         "max_steps": 5,
@@ -44,4 +44,4 @@
         "shuffle": true
     },
     "deepspeed_config": "configs/smoke_ds_zero0.json"
-}
+}
\ No newline at end of file
diff --git a/training/opsd/configs/smoke_hybrid_gc.json b/training/opsd/configs/smoke_hybrid_gc.json
index 8c563b199..e32d070a9 100644
--- a/training/opsd/configs/smoke_hybrid_gc.json
+++ b/training/opsd/configs/smoke_hybrid_gc.json
@@ -2,7 +2,7 @@
     "student": {
         "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
         "dtype": "bfloat16",
-        "trust_remote_code": false,
+        "trust_remote_code": false
     },
     "teacher": {
         "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
@@ -18,7 +18,7 @@
         "top_p": 1.0,
         "top_k": -1,
         "n_samples_per_prompt": 1,
-        "use_graph_capture": true,
+        "use_graph_capture": true
     },
     "distillation": {
         "loss_type": "reverse_kl",
@@ -29,7 +29,7 @@
         "train_batch_size": 1,
         "micro_batch_size_per_gpu": 1,
         "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
+        "learning_rate": 1e-06,
         "weight_decay": 0.0,
         "num_train_epochs": 1,
         "max_steps": 5,
@@ -45,4 +45,4 @@
         "shuffle": true
     },
     "deepspeed_config": "configs/smoke_ds_zero0.json"
-}
+}
\ No newline at end of file