diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
new file mode 100644
index 000000000000..705563035f94
--- /dev/null
+++ b/benchmarks/opsd/bench_14b_rollout.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC.
+
+The tensor-parallel degree equals the world size of the launch, and every launch
+benchmarks the rollout both without and with graph capture.
+"""
+import time
+import os
+import traceback
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL = "Qwen/Qwen2.5-14B-Instruct"
+MAX_NEW_TOKENS = 256
+N_SAMPLES = 1
+CB_SIZE = 1
+N_RUNS = 5
+PROMPT = "def fibonacci(n):"
+
+
+def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
+    """Time ``N_RUNS`` rollouts of ``PROMPT`` and print latency/throughput on rank 0.
+
+    Args:
+        engine: Engine returned by ``deepspeed.initialize``.
+        tokenizer: Tokenizer used to encode ``PROMPT`` and to identify padding.
+        use_graph_capture: Forwarded to ``HybridEngineRollout``.
+        cb_size: Forwarded to ``HybridEngineRollout`` as ``continuous_batching_size``.
+        label: Tag printed in front of the result line.
+
+    Returns:
+        ``(total_toks, t_avg)``: the non-pad response tokens of the last run and the
+        mean latency in seconds of the measured runs.
+    """
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=cb_size,
+        use_graph_capture=use_graph_capture,
+    )
+
+    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=MAX_NEW_TOKENS,
+                              temperature=0.8,
+                              top_p=0.95,
+                              n_samples_per_prompt=N_SAMPLES)
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    for i in range(N_RUNS):
+        torch.manual_seed(42 + i)
+        engine.eval()
+        torch.cuda.synchronize()  #ignore-cuda
+        # perf_counter is monotonic; time.time can jump when the wall clock is adjusted
+        t0 = time.perf_counter()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+        engine.train()
+
+    # Count tokens from last run. Each run uses a different seed, so tok/s (last-run
+    # tokens over mean latency) is approximate if runs differ in generated length.
+    # Compare against None explicitly: a pad token id of 0 is valid but falsy.
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+    total_toks = 0
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    # Average over all timed runs but the first; keep it when it is the only run
+    measured = times[1:] or times
+    t_avg = sum(measured) / len(measured)
+
+    if rank == 0:
+        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s "
+              f"runs={[f'{t*1000:.0f}' for t in times]}")
+
+    return total_toks, t_avg
+
+
+def main():
+    """Build the hybrid-engine DeepSpeed engine and benchmark with and without graph capture."""
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    torch.cuda.set_device(local_rank)  #ignore-cuda
+
+    world_size = torch.distributed.get_world_size()
+    tp_size = world_size  # all GPUs used for TP
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    # Tensor parallelism is only configured when more than one rank was launched
+    if tp_size > 1:
+        ds_config["tensor_parallel"] = {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {
+                "tp_size": tp_size
+            },
+        }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
+        print(f"{'='*60}")
+
+    # 1P1R without graph capture (CB=1, no GC), then 1P1R with CUDA graph capture.
+    # A failure in one variant is reported on rank 0 and the next variant still runs.
+    base_label = f"TP{tp_size} CB={CB_SIZE}"
+    for use_graph_capture, label in ((False, base_label), (True, f"{base_label}+GC")):
+        try:
+            bench_rollout(engine, tokenizer, use_graph_capture=use_graph_capture, cb_size=CB_SIZE, label=label)
+        except Exception as e:
+            if rank == 0:
+                print(f"[{label}] FAILED: {e}")
+                traceback.print_exc()
+
+    if rank == 0:
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
new file mode 100644
index 000000000000..417cb1ec7421
--- /dev/null
+++ b/benchmarks/opsd/bench_autotp_gc.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark rollout with AutoTP + graph capture on 14B model."""
+import time
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def main():
+    """Run a TP=2 graph-capture rollout benchmark and print a summary on rank 0."""
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()  #ignore-cuda
+    torch.cuda.set_device(local_rank)  #ignore-cuda
+    device = torch.device(f"cuda:{local_rank}")
+
+    model_name = "Qwen/Qwen2.5-14B-Instruct"
+    # Benchmark settings, named once so the printed summary cannot drift from what actually runs
+    tp_size = 2
+    n_samples = 4
+    cb_size = 2
+    max_new_tokens = 256
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "tensor_parallel": {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {
+                "tp_size": tp_size
+            },
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": 2,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=cb_size,
+        use_graph_capture=True,
+    )
+
+    # Prepare prompt
+    prompt = "def fibonacci(n):"
+    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens,
+                              temperature=0.8,
+                              top_p=0.95,
+                              n_samples_per_prompt=n_samples)
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    for _ in range(5):
+        torch.manual_seed(42)
+        engine.eval()
+        torch.cuda.synchronize()  #ignore-cuda
+        # perf_counter is monotonic; time.time can jump when the wall clock is adjusted
+        t0 = time.perf_counter()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+        engine.train()
+
+    # The first timed run is left out of the average
+    t_avg = sum(times[1:]) / len(times[1:])
+    # Count tokens of the last run
+    # Compare against None explicitly: a pad token id of 0 is valid but falsy.
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+    total_toks = 0
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {model_name}")
+        print(f"TP={tp_size}, n={n_samples}, cb={cb_size}, graph_capture=True, max_new_tokens={max_new_tokens}")
+        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
+        print(f"Total response tokens: {total_toks}")
+        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
+        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
new file mode 100644
index 000000000000..1f96b6a4d5d5
--- /dev/null
+++ b/benchmarks/opsd/bench_vllm_tp2.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark vLLM TP=2 on 14B, 1P1R.
+
+Launched as a subprocess wrapper to avoid CUDA fork issues.
+"""
+import os
+import subprocess
+import sys
+import tempfile
+
+# Benchmark body, executed by a fresh interpreter. It sets CUDA_VISIBLE_DEVICES
+# to "1,2" before importing vLLM.
+script = '''
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
+import time
+from vllm import LLM, SamplingParams
+
+llm = LLM("Qwen/Qwen2.5-14B-Instruct", tensor_parallel_size=2,
+          gpu_memory_utilization=0.85, dtype="bfloat16", enforce_eager=True)
+sp = SamplingParams(max_tokens=256, temperature=0.8, top_p=0.95, n=1)
+prompt = "def fibonacci(n):"
+
+# warmup
+llm.generate([prompt], sp)
+
+times = []
+for i in range(5):
+    t0 = time.perf_counter()
+    out = llm.generate([prompt], sp)
+    times.append(time.perf_counter() - t0)
+
+t_avg = sum(times[1:]) / len(times[1:])
+total_toks = sum(len(o.token_ids) for r in out for o in r.outputs)
+print(f"vLLM TP=2 14B 1P1R: {total_toks} toks, {t_avg*1000:.1f}ms, {total_toks/t_avg:.1f} tok/s")
+print(f"Per-run: {[f'{t*1000:.0f}ms' for t in times]}")
+'''
+
+
+def main():
+    """Write the inner script to a private temp dir, run it, and return its exit code."""
+    env = os.environ.copy()
+    # The inner script selects its own GPUs, so drop any inherited device mask
+    env.pop("CUDA_VISIBLE_DEVICES", None)
+
+    # A fresh per-run directory replaces the fixed, world-writable /tmp path: concurrent
+    # runs no longer overwrite each other's script and the file is removed afterwards.
+    with tempfile.TemporaryDirectory(prefix="bench_vllm_") as tmp_dir:
+        tmp = os.path.join(tmp_dir, "bench_vllm_inner.py")
+        with open(tmp, "w") as f:
+            f.write(script)
+        # Exec in a fresh process with no prior CUDA init
+        proc = subprocess.run([sys.executable, tmp], env=env)
+    return proc.returncode
+
+
+if __name__ == "__main__":
+    sys.exit(main())