Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/models/qwen3/launch_distributed_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
"""Launch torchrun from inside a scalene-instrumented process.

Not meant to be invoked by hand — ``test.sh --profile`` runs this file under
``scalene run --off --profile-all``. With ``--profile-all``, scalene's
``redirect_python`` shims ``python`` on PATH, so the workers torchrun spawns
via subprocess inherit that shim and each carries a dormant (``--off``)
scalene instance. Rank 0's KernelProfiler later toggles CPU profiling with
``scalene_profiler.start()``/``stop()``.
"""

from torch.distributed.run import main as _torchrun_entry

if __name__ == "__main__":
    # sys.exit(main()) in the original; SystemExit carries the same status.
    raise SystemExit(_torchrun_entry())
84 changes: 84 additions & 0 deletions examples/models/qwen3/profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""Profile Qwen3-30B with device tracing in a distributed (TP) setting.

Run via test.sh::

bash test.sh --profile
"""

import argparse
import os
import sys

import torch.distributed as dist

# Add model dir to path so we can import qwen3 modules
_MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _MODEL_DIR)
os.chdir(_MODEL_DIR)

from nkipy.tools.profiler import KernelProfiler
from qwen3 import Qwen3Model, load_model


def main():
    """Run profiled distributed token generation for Qwen3.

    Parses CLI options, loads the model (``load_model`` performs the
    torch.distributed init, weight loading, kernel compilation and warmup),
    then generates tokens inside a ``KernelProfiler`` context so device
    traces are captured. Intended to be launched via torchrun from
    ``test.sh --profile``, not directly.
    """
    parser = argparse.ArgumentParser(description="Profile Qwen3 distributed")
    parser.add_argument("-n", "--max-new-tokens", type=int, default=16)
    parser.add_argument("prompt", nargs="?", default="The capital of France is")
    parser.add_argument("--checkpoint", default="./tmp_qwen3-30b-a3b")
    parser.add_argument("--model", default="Qwen/Qwen3-30B-A3B")
    parser.add_argument(
        "--profile-all-ranks",
        action="store_true",
        help="Profile all ranks (default: rank 0 only)",
    )
    parser.add_argument(
        "--output-dir", default=None, help="Output directory for profiles"
    )
    parser.add_argument(
        "--no-scalene",
        action="store_true",
        help="Disable scalene CPU profiling",
    )
    args = parser.parse_args()

    # Default the output dir to this file's directory. (The original wrapped
    # this in a one-argument os.path.join, which is a no-op.)
    output_dir = args.output_dir
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__))

    # load_model handles dist init, weight loading, kernel compilation, warmup
    model, input_ids, tokenizer = load_model(args)
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    target_ranks = list(range(world_size)) if args.profile_all_ranks else [0]
    output_path = os.path.join(output_dir, "kernel_profile.json")

    # Scalene CPU profiling only on rank 0 (requires process to be launched
    # under `scalene run --off`). Other ranks skip scalene entirely.
    use_scalene = (not args.no_scalene) and (rank == 0)

    # Line all ranks up before the profiled region starts.
    dist.barrier()

    # Each rank sees its core as local core 0 via NEURON_RT_VISIBLE_CORES
    with KernelProfiler(
        core_id=0,
        scalene=use_scalene,
        output_path=output_path,
        target_ranks=target_ranks,
    ):
        num_generated = 0
        for token_id in model.generate(input_ids):
            num_generated += 1
            output_id = token_id[0].tolist()
            # 151643 / 151645 are presumably Qwen's end-of-text and <|im_end|>
            # special tokens — TODO confirm against the tokenizer config.
            if output_id[-1] in (151643, 151645):
                break
            if rank == 0:
                print(tokenizer.decode(output_id), end="", flush=True)

    if rank == 0:
        print(f"\nGenerated {num_generated} tokens")


if __name__ == "__main__":
    main()
75 changes: 67 additions & 8 deletions examples/models/qwen3/test.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
#!/bin/bash
# Test script for Qwen3-30B-A3B on Trainium
# Usage: bash test.sh
# Usage: bash test.sh [--profile [--no-scalene]]

set -e

# Absolute paths: this script's directory and the repo root three levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"

# Minimal flag parsing; flags may appear in any order, anything else is
# silently ignored.
PROFILE=""
NO_SCALENE=""
for arg in "$@"; do
case $arg in
--profile) PROFILE=1 ;;
--no-scalene) NO_SCALENE=1 ;;
esac
done

echo "=========================================="
echo "Qwen3-30B-A3B Test Script"
echo "=========================================="
Expand All @@ -29,15 +41,62 @@ else
echo "✓ Weights found at $WEIGHTS_PATH"
fi

# Step 3: Run example
# Ensure NRT inspect is disabled (conflicts with SystemTraceSession)
unset NEURON_RT_INSPECT_ENABLE NEURON_RT_INSPECT_OUTPUT_DIR 2>/dev/null || true

# Step 3: Run
echo ""
# NOTE(review): this section was recovered from a diff view that appears to
# interleave removed and added lines — e.g. the unconditional banner below is
# immediately followed by a PROFILE branch printing its own banner. Verify
# the whole block against the post-merge file before trusting it.
echo "[3/3] Running Qwen3 inference..."
echo "=========================================="
if [ -n "$PROFILE" ]; then
echo "[3/3] Running profiled inference (TP=$TP_DEGREE)..."
echo "=========================================="

if [ -z "$NO_SCALENE" ]; then
# Scalene on rank 0 + device tracing
scalene run --off --profile-all --cpu-only \
-o "$SCRIPT_DIR/scalene_profile.json" --- \
"$SCRIPT_DIR/launch_distributed_worker.py" \
--nproc-per-node "$TP_DEGREE" \
"$SCRIPT_DIR/profile.py" \
--output-dir "$SCRIPT_DIR" \
--checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B || true

# Enable async to improve performance
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
# export NEURON_LOGICAL_NC_CONFIG=1
# NOTE(review): the bare torchrun of qwen3.py below looks like a pre-change
# diff line left inside the scalene branch (the same command appears in the
# non-profile else-branch) — confirm it belongs here.
torchrun --nproc-per-node "$TP_DEGREE" qwen3.py -n 500 --checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B
# Merge CPU + device profiles
cd "$REPO_DIR"
if [ -f "$SCRIPT_DIR/scalene_profile.json" ]; then
uv run python -m nkipy.tools.profiler \
"$SCRIPT_DIR/scalene_profile.json" \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
else
uv run python -m nkipy.tools.profiler --kernel-only \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
fi
echo ""
echo "View: scalene view $SCRIPT_DIR/merged_profile.json"
else
# Device tracing only, no scalene
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
torchrun --nproc-per-node "$TP_DEGREE" \
"$SCRIPT_DIR/profile.py" \
--output-dir "$SCRIPT_DIR" --no-scalene \
--checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B

cd "$REPO_DIR"
uv run python -m nkipy.tools.profiler --kernel-only \
"$SCRIPT_DIR/kernel_profile.json" \
"$SCRIPT_DIR/merged_profile.json"
echo ""
echo "View: scalene view $SCRIPT_DIR/merged_profile.json"
fi
else
echo "[3/3] Running Qwen3 inference..."
echo "=========================================="

# Enable async to improve performance
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=16
torchrun --nproc-per-node "$TP_DEGREE" qwen3.py -n 500 --checkpoint "$WEIGHTS_PATH" --model Qwen/Qwen3-30B-A3B
fi

echo ""
echo "=========================================="
Expand Down
124 changes: 124 additions & 0 deletions examples/models/qwen3_embedding/profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""Profile Qwen3-Embedding with scalene CPU profiling + device tracing.

Run via test.sh::

bash test.sh --profile
"""

import argparse
import logging
import os
import sys
from pathlib import Path

import numpy as np

# Save original CWD for output path resolution, then chdir to model dir
# (config uses relative paths for weights_dir and build_dir)
_ORIG_CWD = os.getcwd()
_MODEL_DIR = str(Path(__file__).resolve().parent)
sys.path.insert(0, _MODEL_DIR)
os.chdir(_MODEL_DIR)

from config import get_config
from embedding_utils import get_detailed_instruct
from model import Qwen3EmbeddingModel
from nkipy.tools.profiler import KernelProfiler
from prepare_weights import load_qwen3_weights
from transformers import AutoTokenizer


def main():
    """Run a profiled Qwen3-Embedding forward pass and save the kernel trace.

    Warmup iterations run outside the profiler; only the final
    ``--num-iterations`` forward passes are captured by ``KernelProfiler``.
    """
    arg_parser = argparse.ArgumentParser(description="Profile Qwen3-Embedding")
    arg_parser.add_argument(
        "--model-size",
        choices=["0.6b", "8b"],
        default="0.6b",
        help="Model size (default: 0.6b)",
    )
    arg_parser.add_argument("--lnc", type=int, choices=[1, 2], default=2)
    arg_parser.add_argument("--seq-len", type=int, default=None)
    arg_parser.add_argument(
        "--output",
        default="kernel_profile.json",
        help="Kernel profile output path (default: kernel_profile.json)",
    )
    arg_parser.add_argument("--num-warmup", type=int, default=3)
    arg_parser.add_argument(
        "--num-iterations",
        type=int,
        default=1,
        help="Forward passes inside profiled region (default: 1)",
    )
    arg_parser.add_argument(
        "--no-scalene",
        action="store_true",
        help="Disable scalene integration",
    )
    cli = arg_parser.parse_args()

    # A relative --output is interpreted against the directory the user ran
    # us from, not the model dir this module chdir'd into at import time.
    kernel_out = Path(cli.output)
    if not kernel_out.is_absolute():
        kernel_out = Path(_ORIG_CWD) / kernel_out

    # Quiet down library loggers.
    logging.getLogger().setLevel(logging.ERROR)

    config_overrides = {}
    if cli.seq_len is not None:
        config_overrides["max_model_len"] = cli.seq_len

    config = get_config(cli.model_size, **config_overrides)

    print(f"Model: {config.model_name}")
    print(f"Sequence length: {config.max_model_len}")
    print(f"LNC: {cli.lnc}")

    # Tokenizer first, then weights + kernel compilation.
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    print("Loading model and compiling kernels...")
    model = Qwen3EmbeddingModel(
        load_qwen3_weights(config.weights_path), config, lnc=cli.lnc
    )

    # Build a single padded sample as the profiling input.
    task = "Given a web search query, retrieve relevant passages that answer the query"
    sample_text = get_detailed_instruct(task, "What is the capital of China?")

    batch_dict = tokenizer(
        [sample_text],
        padding="max_length",
        truncation=True,
        max_length=config.max_model_len,
        return_tensors="np",
    )
    input_ids = batch_dict["input_ids"].astype(np.uint32)
    attention_mask = batch_dict["attention_mask"].astype(np.float32)

    # Warmup passes are deliberately not profiled.
    print(f"\nWarmup ({cli.num_warmup} iterations)...")
    for _ in range(cli.num_warmup):
        model.forward(input_ids, attention_mask)

    iters = cli.num_iterations
    print(f"\nRunning profiled forward pass ({iters} iteration(s))...")
    with KernelProfiler(
        core_id=0,
        scalene=not cli.no_scalene,
        output_path=kernel_out,
    ) as profiler:
        for _ in range(iters):
            model.forward(input_ids, attention_mask)

    result = profiler.result
    print(f"\nDone. Captured {len(result.kernel_calls)} kernel calls.")
    print(f"Kernel profile saved to: {kernel_out}")


if __name__ == "__main__":
    main()
Loading
Loading