diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
new file mode 100644
index 000000000000..705563035f94
--- /dev/null
+++ b/benchmarks/opsd/bench_14b_rollout.py
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
+import time
+import os
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL = "Qwen/Qwen2.5-14B-Instruct"  # model id passed to from_pretrained() in main()
+MAX_NEW_TOKENS = 256  # max_new_tokens given to SamplingConfig
+N_SAMPLES = 1  # n_samples_per_prompt given to SamplingConfig
+CB_SIZE = 1  # continuous_batching_size given to HybridEngineRollout
+N_RUNS = 5  # number of timed generations after the single warmup
+PROMPT = "def fibonacci(n):"  # the one prompt tokenized for every run
+
+
+def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
+    """Time N_RUNS rollouts of PROMPT and print latency/throughput on rank 0.
+
+    The first timed run is left out of the averages when more than one exists.
+    Tok/s divides tokens by time over the same runs, since each run has its own seed.
+    Returns (response tokens of the last run, mean latency in seconds).
+    """
+    rank = torch.distributed.get_rank()
+    device = torch.device(f"cuda:{int(os.environ.get('LOCAL_RANK', 0))}")
+    rollout = HybridEngineRollout(engine=engine, tokenizer=tokenizer, continuous_batching_size=cb_size,
+                                  use_graph_capture=use_graph_capture)
+    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95,
+                              n_samples_per_prompt=N_SAMPLES)
+    # Explicit None test: a pad token id of 0 is valid and must not fall back to EOS.
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+
+    def count_response_tokens(batch):
+        return sum((batch.input_ids[i, batch.response_start_idx[i]:] != pad_id).sum().item()
+                   for i in range(batch.input_ids.shape[0]))
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times, toks = [], []
+    for i in range(N_RUNS):
+        torch.manual_seed(42 + i)
+        engine.eval()
+        torch.cuda.synchronize()  #ignore-cuda
+        t0 = time.perf_counter()  # monotonic clock; time.time() can jump
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+        engine.train()
+        toks.append(count_response_tokens(batch))  # counted outside the timed region
+
+    total_toks = toks[-1]
+    kept = slice(1, None) if len(times) > 1 else slice(None)
+    t_avg = sum(times[kept]) / len(times[kept])
+    tok_per_s = sum(toks[kept]) / sum(times[kept])
+
+    if rank == 0:
+        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {tok_per_s:.1f} tok/s  "
+              f"runs={[f'{t*1000:.0f}' for t in times]}")
+
+    return total_toks, t_avg
+
+
+def main():
+    """Benchmark rollout on MODEL without and then with CUDA graph capture.
+
+    The tensor-parallel degree equals the world size, so the number of launched
+    ranks selects it. A failing variant is reported and skipped so that the
+    other one still runs.
+    """
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    torch.cuda.set_device(local_rank)  #ignore-cuda
+
+    world_size = torch.distributed.get_world_size()
+    tp_size = world_size  # all GPUs used for TP
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    # The tensor_parallel section is added only when more than one rank is running.
+    if tp_size > 1:
+        ds_config["tensor_parallel"] = {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {
+                "tp_size": tp_size
+            },
+        }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
+        print(f"{'='*60}")
+
+    # 1P1R: first without, then with CUDA graph capture.
+    for use_graph_capture, suffix in ((False, ""), (True, "+GC")):
+        label = f"TP{tp_size} CB={CB_SIZE}{suffix}"
+        try:
+            bench_rollout(engine, tokenizer, use_graph_capture=use_graph_capture, cb_size=CB_SIZE, label=label)
+        except Exception as e:
+            # Report from every rank: a failure confined to a non-zero rank would otherwise vanish.
+            print(f"[{label}] FAILED on rank {rank}: {e}", flush=True)
+            import traceback
+            traceback.print_exc()
+
+    if rank == 0:
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script
+    main()
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
new file mode 100644
index 000000000000..417cb1ec7421
--- /dev/null
+++ b/benchmarks/opsd/bench_autotp_gc.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark rollout with AutoTP + graph capture on 14B model."""
+import time
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def main():
+    """Benchmark rollout with AutoTP (TP=2) and CUDA graph capture on a 14B model.
+
+    Runs one untimed warmup and five timed generations, then prints latency and
+    throughput on rank 0. The first timed run is left out of the average.
+    """
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = rank % torch.cuda.device_count()  #ignore-cuda
+    torch.cuda.set_device(local_rank)  #ignore-cuda
+    device = torch.device(f"cuda:{local_rank}")
+
+    model_name = "Qwen/Qwen2.5-14B-Instruct"
+    # Named once so the printed summary cannot drift from the settings that actually ran.
+    tp_size, cb_size, n_samples, max_new = 2, 2, 4, 256
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "tensor_parallel": {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {
+                "tp_size": tp_size
+            },
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": 2,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    rollout = HybridEngineRollout(engine=engine, tokenizer=tokenizer, continuous_batching_size=cb_size,
+                                  use_graph_capture=True)
+
+    # Prepare prompt
+    prompt = "def fibonacci(n):"
+    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=max_new, temperature=0.8, top_p=0.95, n_samples_per_prompt=n_samples)
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    for _ in range(5):
+        torch.manual_seed(42)
+        engine.eval()
+        torch.cuda.synchronize()  #ignore-cuda
+        t0 = time.perf_counter()  # monotonic clock; time.time() can jump
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+        engine.train()
+
+    t_avg = sum(times[1:]) / len(times[1:])
+    # Count tokens of the last run; explicit None test because a pad token id of 0 is valid.
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+    total_toks = sum((batch.input_ids[i, batch.response_start_idx[i]:] != pad_id).sum().item()
+                     for i in range(batch.input_ids.shape[0]))
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {model_name}")
+        print(f"TP={tp_size}, n={n_samples}, cb={cb_size}, graph_capture=True, max_new_tokens={max_new}")
+        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
+        print(f"Total response tokens: {total_toks}")
+        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
+        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script
+    main()
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
new file mode 100644
index 000000000000..1f96b6a4d5d5
--- /dev/null
+++ b/benchmarks/opsd/bench_vllm_tp2.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark vLLM TP=2 on 14B, 1P1R.
+
+Launched as a subprocess wrapper to avoid CUDA fork issues.
+"""
+import os
+import subprocess
+import sys
+import tempfile
+
+script = '''
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
+import time
+from vllm import LLM, SamplingParams
+
+llm = LLM("Qwen/Qwen2.5-14B-Instruct", tensor_parallel_size=2,
+          gpu_memory_utilization=0.85, dtype="bfloat16", enforce_eager=True)
+sp = SamplingParams(max_tokens=256, temperature=0.8, top_p=0.95, n=1)
+prompt = "def fibonacci(n):"
+
+llm.generate([prompt], sp)  # warmup
+times = []
+for i in range(5):
+    t0 = time.perf_counter()
+    out = llm.generate([prompt], sp)
+    times.append(time.perf_counter() - t0)
+
+t_avg = sum(times[1:]) / len(times[1:])
+total_toks = sum(len(o.token_ids) for r in out for o in r.outputs)
+print(f"vLLM TP=2 14B 1P1R: {total_toks} toks, {t_avg*1000:.1f}ms, {total_toks/t_avg:.1f} tok/s")
+print(f"Per-run: {[f'{t*1000:.0f}ms' for t in times]}")
+'''
+
+# Run in a fresh process (no prior CUDA init); a private temp dir avoids clashes on a fixed, shared /tmp path.
+env = {k: v for k, v in os.environ.items() if k != "CUDA_VISIBLE_DEVICES"}
+with tempfile.TemporaryDirectory() as tmp_dir:
+    tmp = os.path.join(tmp_dir, "bench_vllm_inner.py")
+    with open(tmp, "w") as f:
+        f.write(script)
+    proc = subprocess.run([sys.executable, tmp], env=env)
+sys.exit(proc.returncode)