From a7476785b1c15dfc27f4db3b3b4233de7daa8a58 Mon Sep 17 00:00:00 2001 From: Jinsun Yoo Date: Wed, 13 May 2026 15:19:22 +0000 Subject: [PATCH 1/5] Sample Scripts to Collect Traces --- .gitignore | 4 +- .../post_execution/combine_trace.sh | 79 ++++++++++++++ .../post_execution/sample_model.py | 37 +++++++ .../post_execution/simple_computeonly.py | 82 ++++++++++++++ .../post_execution/simple_multirank.py | 102 ++++++++++++++++++ 5 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 trace_collection/post_execution/combine_trace.sh create mode 100644 trace_collection/post_execution/sample_model.py create mode 100644 trace_collection/post_execution/simple_computeonly.py create mode 100644 trace_collection/post_execution/simple_multirank.py diff --git a/.gitignore b/.gitignore index 8be933eb..3ee18eca 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ __pycache__/ *.dot .pyre *et_def.pb.cc -*et_def.pb.h \ No newline at end of file +*et_def.pb.h + +trace_collection/post_execution/traces \ No newline at end of file diff --git a/trace_collection/post_execution/combine_trace.sh b/trace_collection/post_execution/combine_trace.sh new file mode 100644 index 00000000..caf0e755 --- /dev/null +++ b/trace_collection/post_execution/combine_trace.sh @@ -0,0 +1,79 @@ +set -ex pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TRACE_DIR="${TRACE_DIR:-${SCRIPT_DIR}/traces}" + +# --------------------------------------------------------------------------- +# Validate inputs +# --------------------------------------------------------------------------- +if [[ ! -d "${TRACE_DIR}" ]]; then + echo "[ERROR] Trace directory not found: ${TRACE_DIR}" + exit 1 +fi + +# Automatically detect number of ranks from host*.json files +NUM_RANKS=$(ls "${TRACE_DIR}"/host*.json 2>/dev/null | wc -l) +if [[ "${NUM_RANKS}" -eq 0 ]]; then + echo "[ERROR] No host_*.json files found in ${TRACE_DIR}" + exit 1 +fi +echo "[INFO] Found ${NUM_RANKS} rank(s) in ${TRACE_DIR}" + +# --------------------------------------------------------------------------- +# Step 1: chakra_trace_link (host + device → linked JSON in TRACE_DIR) +# --------------------------------------------------------------------------- +echo "" +echo "=== Step 1: chakra_trace_link ===" +for ((rank=0; rank ${LINKED_OUT}" + chakra_trace_link \ + --chakra-host-trace "${HOST_TRACE}" \ + --chakra-device-trace "${DEVICE_TRACE}" \ + --rank "${rank}" \ + --output-file "${LINKED_OUT}" +done +echo "[INFO] All ranks linked." + +# --------------------------------------------------------------------------- +# Step 2: chakra_converter (linked JSON → protobuf .et in TRACE_DIR) +# ASTRA-sim expects files named {prefix}.{npu_id}.et +# e.g. chakra_trace.0.et, chakra_trace.1.et, ... +# --------------------------------------------------------------------------- +echo "" +echo "=== Step 2: chakra_converter ===" +for ((rank=0; rank ${ET_OUT}" + chakra_converter --log-filename /dev/null \ + PyTorch \ + --input "${LINKED_IN}" \ + --output "${ET_OUT}" +done +echo "[INFO] All ranks converted." + +# --------------------------------------------------------------------------- +# Step 3: chakra_jsonizer (protobuf .et to JSON in TRACE_DIR) +# --------------------------------------------------------------------------- +echo "" +echo "=== Step 3: chakra_jsonizer ===" +for ((rank=0; rank ${JSON_OUT}" + chakra_jsonizer \ + --input "${ET_OUT}" \ + --output "${JSON_OUT}" +done +echo "[INFO] All ranks converted." + +echo "" +echo "=== Done ===" +echo " Files: chakra_trace.0.et ... chakra_trace.$((NUM_RANKS-1)).et" +echo "" \ No newline at end of file diff --git a/trace_collection/post_execution/sample_model.py b/trace_collection/post_execution/sample_model.py new file mode 100644 index 00000000..47dc60ef --- /dev/null +++ b/trace_collection/post_execution/sample_model.py @@ -0,0 +1,37 @@ +"""Simple neural network model for post-execution trace collection.""" + +import torch +import torch.nn as nn + + +class SampleModel(nn.Module): + """A simple neural network with linear layer and ReLU activation.""" + + def __init__(self, input_size: int = 1024, hidden_size: int = 256 * 1024, output_size: int = 256) -> None: + """ + Initialize the model. + + Args: + input_size: Size of input features + hidden_size: Size of hidden layer + output_size: Size of output layer + """ + super().__init__() + self.linear = nn.Linear(input_size, hidden_size) + self.linear2 = nn.Linear(hidden_size, output_size) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward pass through the model. + + Args: + x: Input tensor + + Returns: + Output tensor after linear layer and ReLU + """ + x = self.linear(x) + x = self.relu(x) + x = self.linear2(x) + return x diff --git a/trace_collection/post_execution/simple_computeonly.py b/trace_collection/post_execution/simple_computeonly.py new file mode 100644 index 00000000..3571a66c --- /dev/null +++ b/trace_collection/post_execution/simple_computeonly.py @@ -0,0 +1,82 @@ +"""Simple script to collect post-execution traces with one rank only(=compute only).""" + +import os + +import torch +import torch.nn as nn +from torch.profiler import ExecutionTraceObserver, profile + +from sample_model import SampleModel + + +def main() -> None: + """Run 10 iterations of forward and backward passes.""" + batch_size = 64 + input_size = 1024 + output_size = 256 + output_path = "traces" + device = "cuda" if torch.cuda.is_available() else "cpu" + + model = SampleModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + criterion = nn.MSELoss() + model.to(device) + + # Initializing Host profiler and Kineto profiler + rank = 0 # Single rank for compute-only + wait_iters = 0 + warmup_iters = 5 + active_iters = 5 + total_steps = wait_iters + warmup_iters + active_iters + + os.makedirs(output_path, exist_ok=True) + + et = ExecutionTraceObserver() + et.register_callback(f"{output_path}/host.{rank}.json") + + def device_trace_handler(prof): + prof.export_chrome_trace(f"{output_path}/device.{rank}.json") + + activities = [torch.profiler.ProfilerActivity.CPU] + if "cuda" in device: + activities.append(torch.profiler.ProfilerActivity.CUDA) + + with profile( + activities=activities, + schedule=torch.profiler.schedule(wait=wait_iters, warmup=warmup_iters, active=active_iters), + on_trace_ready=device_trace_handler, + record_shapes=True, + execution_trace_observer=et, + ) as prof: + print("Starting training loop...") + for step_id in range(total_steps): + # Generate random input data + x = torch.randn(batch_size, input_size, device=device) + target = torch.randn(batch_size, output_size, device=device) + + # Forward pass + output = model(x) + + # Compute loss + loss = criterion(output, target) + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + if "cuda" in device: + # Ensure all CUDA operations are complete before moving to the next step. + torch.cuda.synchronize() + + print(f"Iteration {step_id + 1}/{total_steps}, Loss: {loss.item():.4f}") + + # Mark the end of a step + prof.step() + + et.stop() + et.unregister_callback() + print("Training complete!") + + +if __name__ == "__main__": + main() diff --git a/trace_collection/post_execution/simple_multirank.py b/trace_collection/post_execution/simple_multirank.py new file mode 100644 index 00000000..b5bbbf5d --- /dev/null +++ b/trace_collection/post_execution/simple_multirank.py @@ -0,0 +1,102 @@ +"""Simple script to collect post-execution traces with multiple ranks via torchrun.""" + +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.profiler import ExecutionTraceObserver, profile + +from sample_model import SampleModel + + +def main() -> None: + """Run 10 iterations of forward and backward passes.""" + batch_size = 64 + input_size = 1024 + output_size = 256 + output_path = "traces" + + # torchrun provides these environment variables for each process. + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if torch.cuda.is_available(): + backend = "nccl" + torch.cuda.set_device(local_rank) + device = f"cuda:{local_rank}" + else: + backend = "gloo" + device = "cpu" + + dist.init_process_group(backend=backend) + rank = dist.get_rank() + world_size = dist.get_world_size() + + model = SampleModel() + criterion = nn.MSELoss() + model.to(device) + if torch.cuda.is_available(): + model = DDP(model, device_ids=[local_rank], output_device=local_rank) + else: + model = DDP(model) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + + # Initializing Host profiler and Kineto profiler + wait_iters = 0 + warmup_iters = 5 + active_iters = 5 + total_steps = wait_iters + warmup_iters + active_iters + + os.makedirs(output_path, exist_ok=True) + + et = ExecutionTraceObserver() + et.register_callback(f"{output_path}/host.{rank}.json") + + def device_trace_handler(prof): + prof.export_chrome_trace(f"{output_path}/device.{rank}.json") + + activities = [torch.profiler.ProfilerActivity.CPU] + if "cuda" in device: + activities.append(torch.profiler.ProfilerActivity.CUDA) + + with profile( + activities=activities, + schedule=torch.profiler.schedule(wait=wait_iters, warmup=warmup_iters, active=active_iters), + on_trace_ready=device_trace_handler, + record_shapes=True, + execution_trace_observer=et, + ) as prof: + print(f"Starting training loop on rank {rank}/{world_size} using {device}...") + for step_id in range(total_steps): + # Generate random input data + x = torch.randn(batch_size, input_size, device=device) + target = torch.randn(batch_size, output_size, device=device) + + # Forward pass + output = model(x) + + # Compute loss + loss = criterion(output, target) + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + if "cuda" in device: + # Ensure all CUDA operations are complete before moving to the next step. + torch.cuda.synchronize() + + print(f"Rank {rank} Iteration {step_id + 1}/{total_steps}, Loss: {loss.item():.4f}") + + # Mark the end of a step + prof.step() + + et.stop() + et.unregister_callback() + dist.barrier() + dist.destroy_process_group() + print(f"Rank {rank} training complete!") + + +if __name__ == "__main__": + main() From cabe6ddfd557af1c7aeefb0d7ea05a74e2db4df3 Mon Sep 17 00:00:00 2001 From: Jinsun Yoo Date: Fri, 15 May 2026 13:44:22 +0000 Subject: [PATCH 2/5] Newline in gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3ee18eca..6a4cb96b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ __pycache__/ *et_def.pb.cc *et_def.pb.h -trace_collection/post_execution/traces \ No newline at end of file +trace_collection/post_execution/traces + From 97e178e9d634a3663ca0599a559a4b2088e404a4 Mon Sep 17 00:00:00 2001 From: Jinsun Yoo Date: Fri, 15 May 2026 13:45:51 +0000 Subject: [PATCH 3/5] Command line command for multirun --- .../post_execution/simple_multirank.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/trace_collection/post_execution/simple_multirank.py b/trace_collection/post_execution/simple_multirank.py index b5bbbf5d..d1e78854 100644 --- a/trace_collection/post_execution/simple_multirank.py +++ b/trace_collection/post_execution/simple_multirank.py @@ -1,4 +1,21 @@ """Simple script to collect post-execution traces with multiple ranks via torchrun.""" +""" + # When running across multiple nodes + mpirun -np 4 -N 2 \ + torchrun \ + --nnodes=2 \ + --nproc_per_node=2 \ + --rdzv_backend=c10d \ + rdzv_endpoint=:29501 \ + --simple_multirank.py +""" +"""OR""" +""" + # When running on one node + torchrun \ + --nproc_per_node=2 \ + --simple_multirank.py +""" import os From a5bac116fbaea8831efe8df43e0b4d84057dab39 Mon Sep 17 00:00:00 2001 From: Jinsun Yoo Date: Fri, 15 May 2026 13:46:51 +0000 Subject: [PATCH 4/5] Rename onerank and add run script --- .../{simple_computeonly.py => simple_onerank.py} | 3 +++ 1 file changed, 3 insertions(+) rename trace_collection/post_execution/{simple_computeonly.py => simple_onerank.py} (98%) diff --git a/trace_collection/post_execution/simple_computeonly.py b/trace_collection/post_execution/simple_onerank.py similarity index 98% rename from trace_collection/post_execution/simple_computeonly.py rename to trace_collection/post_execution/simple_onerank.py index 3571a66c..839bffbe 100644 --- a/trace_collection/post_execution/simple_computeonly.py +++ b/trace_collection/post_execution/simple_onerank.py @@ -1,4 +1,7 @@ """Simple script to collect post-execution traces with one rank only(=compute only).""" +""" + python simple_onerank.py +""" import os From b66a9c28d15b346485938d12f6ea2062bbbe3ced Mon Sep 17 00:00:00 2001 From: Jinsun Yoo Date: Fri, 15 May 2026 13:49:08 +0000 Subject: [PATCH 5/5] Ruff --- .../post_execution/simple_multirank.py | 25 +++++++++---------- .../post_execution/simple_onerank.py | 15 ++++++----- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/trace_collection/post_execution/simple_multirank.py b/trace_collection/post_execution/simple_multirank.py index d1e78854..c175b8ae 100644 --- a/trace_collection/post_execution/simple_multirank.py +++ b/trace_collection/post_execution/simple_multirank.py @@ -1,4 +1,14 @@ +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from sample_model import SampleModel +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.profiler import ExecutionTraceObserver, profile + """Simple script to collect post-execution traces with multiple ranks via torchrun.""" + """ # When running across multiple nodes mpirun -np 4 -N 2 \ @@ -17,17 +27,6 @@ --simple_multirank.py """ -import os - -import torch -import torch.distributed as dist -import torch.nn as nn -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.profiler import ExecutionTraceObserver, profile - -from sample_model import SampleModel - - def main() -> None: """Run 10 iterations of forward and backward passes.""" batch_size = 64 @@ -52,7 +51,7 @@ def main() -> None: model = SampleModel() criterion = nn.MSELoss() model.to(device) - if torch.cuda.is_available(): + if torch.cuda.is_available(): # noqa: SIM108 model = DDP(model, device_ids=[local_rank], output_device=local_rank) else: model = DDP(model) @@ -104,7 +103,7 @@ def device_trace_handler(prof): torch.cuda.synchronize() print(f"Rank {rank} Iteration {step_id + 1}/{total_steps}, Loss: {loss.item():.4f}") - + # Mark the end of a step prof.step() diff --git a/trace_collection/post_execution/simple_onerank.py b/trace_collection/post_execution/simple_onerank.py index 839bffbe..cbf8b948 100644 --- a/trace_collection/post_execution/simple_onerank.py +++ b/trace_collection/post_execution/simple_onerank.py @@ -1,16 +1,15 @@ -"""Simple script to collect post-execution traces with one rank only(=compute only).""" -""" - python simple_onerank.py -""" - import os import torch import torch.nn as nn +from sample_model import SampleModel from torch.profiler import ExecutionTraceObserver, profile -from sample_model import SampleModel +"""Simple script to collect post-execution traces with one rank only(=compute only).""" +""" + python simple_onerank.py +""" def main() -> None: """Run 10 iterations of forward and backward passes.""" @@ -26,7 +25,7 @@ def main() -> None: model.to(device) # Initializing Host profiler and Kineto profiler - rank = 0 # Single rank for compute-only + rank = 0 # Single rank for compute-only wait_iters = 0 warmup_iters = 5 active_iters = 5 @@ -72,7 +71,7 @@ def device_trace_handler(prof): torch.cuda.synchronize() print(f"Iteration {step_id + 1}/{total_steps}, Loss: {loss.item():.4f}") - + # Mark the end of a step prof.step()