AMDResearch · mawad-amd · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/accordo/accordo/cli.py b/accordo/accordo/cli.py
@@ -9,6 +9,7 @@
         --ref-binary ./ref \\
         --opt-binary ./opt \\
         [--tolerance 1e-6] \\
+        [--atol 1e-6] [--rtol 1e-5] [--equal-nan] \\
         [--timeout 30] \\
         [--working-dir .] \\
         [--kernel-args "input:const float*,output:float*"] \\
@@ -59,7 +60,15 @@ def _build_validate_parser(subparsers: argparse._SubParsersAction) -> None:
         help="Path to optimized executable (single path; use API or a wrapper for argv)",
     )
     p.add_argument(
-        "--tolerance", type=float, default=1e-6, help="Absolute tolerance (default: 1e-6)"
+        "--tolerance", type=float, default=None, help="Legacy alias for --atol (default: 1e-6)"
+    )
+    p.add_argument("--atol", type=float, default=None, help="Absolute tolerance (default: 1e-6)")
+    p.add_argument("--rtol", type=float, default=0.0, help="Relative tolerance (default: 0.0)")
+    p.add_argument(
+        "--equal-nan",
+        action="store_true",
+        default=False,
+        help="Treat NaN values as equal (default: False)",
     )
     p.add_argument(
         "--timeout", type=int, default=30, help="Timeout per snapshot in seconds (default: 30)"
@@ -122,19 +131,23 @@ def _run_validate(args: argparse.Namespace) -> int:
             ref_snapshot,
             opt_snapshot,
             tolerance=args.tolerance,
+            atol=args.atol,
+            rtol=args.rtol,
+            equal_nan=args.equal_nan,
         )
 
         mismatches_serialized = []
         for m in result.mismatches or []:
-            mismatches_serialized.append(
-                {
-                    "arg_index": m.arg_index,
-                    "arg_name": m.arg_name,
-                    "arg_type": m.arg_type,
-                    "max_difference": m.max_difference,
-                    "mean_difference": m.mean_difference,
-                }
-            )
+            entry = {
+                "arg_index": m.arg_index,
+                "arg_name": m.arg_name,
+                "arg_type": m.arg_type,
+                "max_difference": m.max_difference,
+                "mean_difference": m.mean_difference,
+            }
+            if m.dispatch_index is not None:
+                entry["dispatch_index"] = m.dispatch_index
+            mismatches_serialized.append(entry)
 
         output = {
             "is_valid": result.is_valid,

diff --git a/linex/README.md b/linex/README.md
@@ -24,6 +24,29 @@ for line in profiler.source_lines[:5]:
     print(f"  {line.total_cycles:,} cycles ({line.stall_percent:.1f}% stalled)")
 ```
 
+## Distributed Launchers
+
+Linex supports distributed profiling with launchers such as `torchrun`, `mpirun`,
+`srun`, and `horovodrun`. Pass the launcher through the `launcher` argument, not
+inside `command`, so Linex builds the correct command order (`launcher rocprofv3 ... -- app`).
+
+```python
+profiler = Linex()
+profiler.profile(
+    command="train.py",
+    launcher="torchrun --nproc_per_node=8",
+    output_dir="linex_sqtt",
+)
+
+print(profiler.distributed_context.global_rank)
+for rank_key, rank_profile in profiler.rank_profiles.items():
+    print(rank_key, len(rank_profile.source_lines))
+```
+
+In distributed mode, Linex writes each rank's traces into its own subdirectory
+(`.../rank0000`, `.../rank0001`, ...) to avoid collisions between ranks. Rank metadata
+is detected automatically from the environment variables set by the launcher.
+
 ## What You Get
 
 **Instruction-level metrics mapped to source lines:**
@@ -66,6 +89,8 @@ profiler = Linex(
 **Properties:**
 - `source_lines` - List[SourceLine] sorted by total_cycles
 - `instructions` - List[InstructionData]
+- `rank_profiles` - Per-rank profiling data for distributed runs, keyed by rank
+- `distributed_context` - Detected launcher/rank metadata
 
 ### SourceLine
 

diff --git a/linex/src/linex/__init__.py b/linex/src/linex/__init__.py
@@ -8,7 +8,7 @@
 providing cycle counts and performance metrics per source line.
 """
 
-from .api import Linex, SourceLine, InstructionData
+from .api import InstructionData, Linex, RankProfile, SourceLine
 
 __version__ = "0.1.0"
-__all__ = ["Linex", "SourceLine", "InstructionData"]
+__all__ = ["Linex", "SourceLine", "InstructionData", "RankProfile"]