Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/models/qwen3/launch_distributed_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
"""Launch torchrun from inside a scalene-instrumented process.

Not meant to be invoked by hand — ``test.sh --profile`` runs this file under
``scalene run --off --profile-all``. With ``--profile-all``, scalene's
``redirect_python`` shims ``python`` on PATH, so the workers torchrun spawns
via subprocess inherit that shim and each carries a dormant (``--off``)
scalene instance. Rank 0's KernelProfiler later toggles CPU profiling with
``scalene_profiler.start()``/``stop()``.
"""

from torch.distributed.run import main as _torchrun_entry

if __name__ == "__main__":
    # sys.exit(main()) in the original; SystemExit carries the same status.
    raise SystemExit(_torchrun_entry())
84 changes: 84 additions & 0 deletions examples/models/qwen3/profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""Profile Qwen3-30B with device tracing in a distributed (TP) setting.

Run via test.sh::

bash test.sh --profile
"""

import argparse
import os
import sys

import torch.distributed as dist

# Add model dir to path so we can import qwen3 modules
_MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _MODEL_DIR)
os.chdir(_MODEL_DIR)

from nkipy.tools.profiler import KernelProfiler
from qwen3 import Qwen3Model, load_model


def main():
    """Run profiled distributed token generation for Qwen3.

    Parses CLI options, loads the model (``load_model`` performs the
    torch.distributed init, weight loading, kernel compilation and warmup),
    then generates tokens inside a ``KernelProfiler`` context so device
    traces are captured. Intended to be launched via torchrun from
    ``test.sh --profile``, not directly.
    """
    parser = argparse.ArgumentParser(description="Profile Qwen3 distributed")
    parser.add_argument("-n", "--max-new-tokens", type=int, default=16)
    parser.add_argument("prompt", nargs="?", default="The capital of France is")
    parser.add_argument("--checkpoint", default="./tmp_qwen3-30b-a3b")
    parser.add_argument("--model", default="Qwen/Qwen3-30B-A3B")
    parser.add_argument(
        "--profile-all-ranks",
        action="store_true",
        help="Profile all ranks (default: rank 0 only)",
    )
    parser.add_argument(
        "--output-dir", default=None, help="Output directory for profiles"
    )
    parser.add_argument(
        "--no-scalene",
        action="store_true",
        help="Disable scalene CPU profiling",
    )
    args = parser.parse_args()

    # Default the output dir to this file's directory. (The original wrapped
    # this in a one-argument os.path.join, which is a no-op.)
    output_dir = args.output_dir
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__))

    # load_model handles dist init, weight loading, kernel compilation, warmup
    model, input_ids, tokenizer = load_model(args)
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    target_ranks = list(range(world_size)) if args.profile_all_ranks else [0]
    output_path = os.path.join(output_dir, "kernel_profile.json")

    # Scalene CPU profiling only on rank 0 (requires process to be launched
    # under `scalene run --off`). Other ranks skip scalene entirely.
    use_scalene = (not args.no_scalene) and (rank == 0)

    # Line all ranks up before the profiled region starts.
    dist.barrier()

    # Each rank sees its core as local core 0 via NEURON_RT_VISIBLE_CORES
    with KernelProfiler(
        core_id=0,
        scalene=use_scalene,
        output_path=output_path,
        target_ranks=target_ranks,
    ):
        num_generated = 0
        for token_id in model.generate(input_ids):
            num_generated += 1
            output_id = token_id[0].tolist()
            # 151643 / 151645 are presumably Qwen's end-of-text and <|im_end|>
            # special tokens — TODO confirm against the tokenizer config.
            if output_id[-1] in (151643, 151645):
                break
            if rank == 0:
                print(tokenizer.decode(output_id), end="", flush=True)

    if rank == 0:
        print(f"\nGenerated {num_generated} tokens")


if __name__ == "__main__":
    main()
75 changes: 67 additions & 8 deletions examples/models/qwen3/test.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
#!/bin/bash
# Test script for Qwen3-30B-A3B on Trainium
# Usage: bash test.sh
# Usage: bash test.sh [--profile [--no-scalene]]

set -e

# Absolute paths: this script's directory and the repo root three levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"

# Minimal flag parsing; flags may appear in any order, anything else is
# silently ignored.
PROFILE=""
NO_SCALENE=""
for arg in "$@"; do
case $arg in
--profile) PROFILE=1 ;;
--no-scalene) NO_SCALENE=1 ;;
esac
done

echo "=========================================="
echo "Qwen3-30B-A3B Test Script"
echo "=========================================="
Expand All @@ -29,15 +41,62 @@ else
echo "✓ Weights found at $WEIGHTS_PATH"
fi

# Step 3: Run example
# Ensure NRT inspect is disabled (conflicts with SystemTraceSession)
unset NEURON_RT_INSPECT_ENABLE NEURON_RT_INSPECT_OUTPUT_DIR 2>/dev/null || true

# Step 3: Run
echo ""
# NOTE(review): this section was recovered from a diff view that appears to
# interleave removed and added lines — e.g. the unconditional banner below is
# immediately followed by a PROFILE branch printing its own banner. Verify
# the whole block against the post-merge file before trusting it.
echo "[3/3] Running Qwen3 inference..."
echo "=========================================="
if [ -n "$PROFILE" ]; then
echo "[3/3] Running profiled inference (TP=$TP_DEGREE)..."
echo "=========================================="

if [ -z "$NO_SCALENE" ]; then
# Scalene on rank 0 + device tracing
scalene run --off --profile-all --cpu-only \
-o "$SCRIPT_DIR/scalene_profile.json" --- \
"$SCRIPT_DIR/launch_distributed_worker.py" \
--nproc-per-node "$TP_DEGREE" \
"$SCRIPT_DIR/profile.py" \
--output-dir "$SCRIPT_DIR" \
--checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B || true

# Enable async to improve performance
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
# export NEURON_LOGICAL_NC_CONFIG=1
# NOTE(review): the bare torchrun of qwen3.py below looks like a pre-change
# diff line left inside the scalene branch (the same command appears in the
# non-profile else-branch) — confirm it belongs here.
torchrun --nproc-per-node "$TP_DEGREE" qwen3.py -n 500 --checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B
# Merge CPU + device profiles
cd "$REPO_DIR"
if [ -f "$SCRIPT_DIR/scalene_profile.json" ]; then
uv run python -m nkipy.tools.profiler \
"$SCRIPT_DIR/scalene_profile.json" \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
else
uv run python -m nkipy.tools.profiler --kernel-only \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
fi
echo ""
echo "View: scalene view $SCRIPT_DIR/merged_profile.json"
else
# Device tracing only, no scalene
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
torchrun --nproc-per-node "$TP_DEGREE" \
"$SCRIPT_DIR/profile.py" \
--output-dir "$SCRIPT_DIR" --no-scalene \
--checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B

cd "$REPO_DIR"
uv run python -m nkipy.tools.profiler --kernel-only \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
echo ""
echo "View: scalene view $SCRIPT_DIR/merged_profile.json"
fi
else
echo "[3/3] Running Qwen3 inference..."
echo "=========================================="

# Enable async to improve performance
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
torchrun --nproc-per-node "$TP_DEGREE" qwen3.py -n 500 --checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B
fi

echo ""
echo "=========================================="
Expand Down
124 changes: 124 additions & 0 deletions examples/models/qwen3_embedding/profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""Profile Qwen3-Embedding with scalene CPU profiling + device tracing.

Run via test.sh::

bash test.sh --profile
"""

import argparse
import logging
import os
import sys
from pathlib import Path

import numpy as np

# Save original CWD for output path resolution, then chdir to model dir
# (config uses relative paths for weights_dir and build_dir)
_ORIG_CWD = os.getcwd()
_MODEL_DIR = str(Path(__file__).resolve().parent)
sys.path.insert(0, _MODEL_DIR)
os.chdir(_MODEL_DIR)

from config import get_config
from embedding_utils import get_detailed_instruct
from model import Qwen3EmbeddingModel
from nkipy.tools.profiler import KernelProfiler
from prepare_weights import load_qwen3_weights
from transformers import AutoTokenizer


def main():
    """Run a profiled Qwen3-Embedding forward pass and save the kernel trace.

    Warmup iterations run outside the profiler; only the final
    ``--num-iterations`` forward passes are captured by ``KernelProfiler``.
    """
    arg_parser = argparse.ArgumentParser(description="Profile Qwen3-Embedding")
    arg_parser.add_argument(
        "--model-size",
        choices=["0.6b", "8b"],
        default="0.6b",
        help="Model size (default: 0.6b)",
    )
    arg_parser.add_argument("--lnc", type=int, choices=[1, 2], default=2)
    arg_parser.add_argument("--seq-len", type=int, default=None)
    arg_parser.add_argument(
        "--output",
        default="kernel_profile.json",
        help="Kernel profile output path (default: kernel_profile.json)",
    )
    arg_parser.add_argument("--num-warmup", type=int, default=3)
    arg_parser.add_argument(
        "--num-iterations",
        type=int,
        default=1,
        help="Forward passes inside profiled region (default: 1)",
    )
    arg_parser.add_argument(
        "--no-scalene",
        action="store_true",
        help="Disable scalene integration",
    )
    cli = arg_parser.parse_args()

    # A relative --output is interpreted against the directory the user ran
    # us from, not the model dir this module chdir'd into at import time.
    kernel_out = Path(cli.output)
    if not kernel_out.is_absolute():
        kernel_out = Path(_ORIG_CWD) / kernel_out

    # Quiet down library loggers.
    logging.getLogger().setLevel(logging.ERROR)

    config_overrides = {}
    if cli.seq_len is not None:
        config_overrides["max_model_len"] = cli.seq_len

    config = get_config(cli.model_size, **config_overrides)

    print(f"Model: {config.model_name}")
    print(f"Sequence length: {config.max_model_len}")
    print(f"LNC: {cli.lnc}")

    # Tokenizer first, then weights + kernel compilation.
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    print("Loading model and compiling kernels...")
    model = Qwen3EmbeddingModel(
        load_qwen3_weights(config.weights_path), config, lnc=cli.lnc
    )

    # Build a single padded sample as the profiling input.
    task = "Given a web search query, retrieve relevant passages that answer the query"
    sample_text = get_detailed_instruct(task, "What is the capital of China?")

    batch_dict = tokenizer(
        [sample_text],
        padding="max_length",
        truncation=True,
        max_length=config.max_model_len,
        return_tensors="np",
    )
    input_ids = batch_dict["input_ids"].astype(np.uint32)
    attention_mask = batch_dict["attention_mask"].astype(np.float32)

    # Warmup passes are deliberately not profiled.
    print(f"\nWarmup ({cli.num_warmup} iterations)...")
    for _ in range(cli.num_warmup):
        model.forward(input_ids, attention_mask)

    iters = cli.num_iterations
    print(f"\nRunning profiled forward pass ({iters} iteration(s))...")
    with KernelProfiler(
        core_id=0,
        scalene=not cli.no_scalene,
        output_path=kernel_out,
    ) as profiler:
        for _ in range(iters):
            model.forward(input_ids, attention_mask)

    result = profiler.result
    print(f"\nDone. Captured {len(result.kernel_calls)} kernel calls.")
    print(f"Kernel profile saved to: {kernel_out}")


if __name__ == "__main__":
    main()
Loading
Loading