From 7b36907371280c160e0107300ed95b314966294a Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Fri, 29 May 2026 14:17:16 -0400
Subject: [PATCH 1/8] conformer_rmsd: rewrite to batch-mode benchmark

Replace the per-mol, hardcoded-SMILES benchmark with a batch-mode bench
that:

* Loads a slice of SMILES from a file (via bench_utils.load_smiles).
* Embeds one base conformer per mol in parallel and jitters via the
  shared bench_utils.embed_and_jitter (with add_hs so ETKDG sees a
  chemically reasonable graph).
* Times a single GetConformerRMSMatrixBatch call vs a serial RDKit loop.
* Sweeps confs_per_mol with a single embed run plus _slice_to_confs
  reuse, so every row sees the same molecule selection.
* Validates GPU output against RDKit (per-pair, with a tolerance) before
  timing and aborts on mismatch.
* Honors --rdkit_max_seconds for the RDKit comparison and --no-rdkit /
  --no-nvmolkit for mode selection.
---
 benchmarks/conformer_rmsd_bench.py | 401 +++++++++++++++++++----------
 1 file changed, 271 insertions(+), 130 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index dbd688c5..83e0f63a 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -13,163 +13,304 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Benchmark: GPU vs CPU conformer RMSD matrix computation.
+"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine.
 
-Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's
-CPU GetConformerRMSMatrix across varying conformer counts and molecule sizes.
+Mirrors the sampling strategy used by the FF / TFD benches: load a slice of
+Enamine REAL, embed one ETKDGv3 base conformer per molecule in parallel, and
+derive the remaining conformers by jittering the base structure. The two
+implementations being compared then process the *batch* of molecules:
 
-Usage:
-    python conformer_rmsd_bench.py
-    python conformer_rmsd_bench.py --num-confs 50 100 200 500
-    python conformer_rmsd_bench.py --smiles "CCCCCCCCCC" --num-confs 500
+* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call covers every mol.
+* RDKit CPU:    ``AllChem.GetConformerRMSMatrix`` called in a serial loop,
+  matching the head-to-head convention of the other single-GPU benches
+  (butina, cross_similarity, tfd) which all compare against single-threaded
+  RDKit.
+
+Throughput is reported in molecules/s and pair-RMSDs/s so configurations with
+different conformer counts can be compared apples-to-apples.
 """
 
 import argparse
-import copy
+import csv
+import multiprocessing as mp
+import time
+from pathlib import Path
 
-import numpy as np
 import torch
-from bench_utils import perturb_conformer
+from bench_utils import embed_and_jitter, load_smiles
 from benchmark_timing import time_it
 from rdkit import Chem
-from rdkit.Chem import AllChem, rdDistGeom
+from rdkit.Chem import AllChem
+
+from nvmolkit.conformerRmsd import GetConformerRMSMatrixBatch
 
-from nvmolkit.conformerRmsd import GetConformerRMSMatrix
 
+def prepare_mols(
+    raw_mols: list[Chem.Mol],
+    confs_per_mol: int,
+    seed: int,
+    num_workers: int,
+) -> list[Chem.Mol]:
+    """Embed one base conformer per mol, then perturb to ``confs_per_mol``.
 
-def _numpy_kabsch_rmsd(p, q):
-    """Independent Kabsch RMSD using numpy SVD."""
-    p_c = p - p.mean(axis=0)
-    q_c = q - q.mean(axis=0)
-    H = p_c.T @ q_c
-    S = np.linalg.svd(H, compute_uv=False)
-    d = np.sign(np.linalg.det(H))
-    S[-1] *= d if d != 0.0 else 1.0
-    Sp = np.sum(p_c**2)
-    Sq = np.sum(q_c**2)
-    return np.sqrt(max((Sp + Sq - 2.0 * np.sum(S)) / len(p), 0.0))
+    Thin wrapper over :func:`bench_utils.embed_and_jitter` that enforces
+    confs_per_mol >= 2 (RMSD needs at least one pair) and adds explicit
+    hydrogens during embedding so ETKDG sees a chemically reasonable graph.
+    """
+    if confs_per_mol < 2:
+        raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}")
+    workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2)
+    return embed_and_jitter(
+        raw_mols,
+        confs_per_mol=confs_per_mol,
+        seed=seed,
+        num_workers=workers,
+        add_hs=True,
+        min_atoms=2,
+        desc=f"Embed + perturb ({confs_per_mol} confs)",
+    )
 
 
-def benchmark_cpu(mol, n_warmup=1, n_iter=5):
-    """Benchmark RDKit CPU GetConformerRMSMatrix.
+def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]:
+    """One RDKit timing iteration: serial loop, returns ``(elapsed_s, n_done)``.
 
-    Deep-copies the molecule before each call because GetConformerRMSMatrix
-    modifies conformer coordinates in-place during alignment; reusing the
-    same molecule would measure already-aligned conformers and understate
-    the true CPU cost.
+    When ``max_seconds > 0`` the loop breaks after the deadline is exceeded;
+    callers compute throughput as ``n_done / elapsed_s`` so a truncated run
+    is still extrapolated to a fair pairs/s figure. ``GetConformerRMSMatrix``
+    mutates conformer coordinates in-place during Kabsch alignment, so each
+    call gets a fresh deserialization.
     """
-    result = time_it(
-        lambda: AllChem.GetConformerRMSMatrix(copy.deepcopy(mol), prealigned=False), runs=n_iter, warmups=n_warmup
-    )
-    return result.median_s
+    deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None
+    start = time.perf_counter()
+    n_done = 0
+    for mol_bytes in payloads:
+        mol = Chem.Mol(mol_bytes)
+        AllChem.GetConformerRMSMatrix(mol, prealigned=False)
+        n_done += 1
+        if deadline is not None and time.perf_counter() >= deadline:
+            break
+    return time.perf_counter() - start, n_done
 
 
-def benchmark_gpu(mol, n_warmup=2, n_iter=10):
-    """Benchmark nvMolKit GPU GetConformerRMSMatrix."""
-    result = time_it(
-        lambda: GetConformerRMSMatrix(mol, prealigned=False), runs=n_iter, warmups=n_warmup, gpu_sync=True
-    )
-    return result.median_s
-
-
-def run_benchmark(smiles, num_confs_list, seed=42):
-    """Run CPU vs GPU benchmark for a molecule at various conformer counts."""
-    mol_base = Chem.AddHs(Chem.MolFromSmiles(smiles))
-    no_h_base = Chem.RemoveHs(Chem.AddHs(Chem.MolFromSmiles(smiles)))
-    n_atoms = no_h_base.GetNumAtoms()
-
-    print(f"\nMolecule: {smiles}  ({n_atoms} heavy atoms)")
-    print(f"{'Confs':>8}  {'Pairs':>10}  {'CPU (ms)':>10}  {'GPU (ms)':>10}  {'Speedup':>8}  {'Match':>6}")
-    print("-" * 70)
-
-    for num_confs in num_confs_list:
-        mol = Chem.RWMol(mol_base)
-        mol.RemoveAllConformers()
-        params = rdDistGeom.ETKDGv3()
-        params.randomSeed = seed
-        params.useRandomCoords = True
-        if rdDistGeom.EmbedMolecule(mol, params=params) < 0:
-            print(f"{num_confs:>8}  {'skipped (embedding failed)':>50}")
-            continue
-        if num_confs < 2:
-            print(f"{num_confs:>8}  {'skipped (need >= 2 confs for RMSD)':>50}")
-            continue
-        base_conf_id = mol.GetConformer().GetId()
-        for conf_idx in range(1, num_confs):
-            new_conf = Chem.Conformer(mol.GetConformer(base_conf_id))
-            perturb_conformer(new_conf, seed=seed + conf_idx)
-            mol.AddConformer(new_conf, assignId=True)
-        perturb_conformer(mol.GetConformer(base_conf_id), seed=seed)
-        actual_confs = mol.GetNumConformers()
-
-        no_h = Chem.RemoveHs(mol)
-        n_pairs = actual_confs * (actual_confs - 1) // 2
-
-        # CPU benchmark
-        cpu_time = benchmark_cpu(no_h)
-
-        # GPU benchmark
-        gpu_time = benchmark_gpu(no_h)
-
-        # Correctness check against numpy Kabsch SVD (sample up to 500 pairs)
-        gpu_result = GetConformerRMSMatrix(no_h, prealigned=False)
-        torch.cuda.synchronize()
-        gpu_rms = gpu_result.numpy().tolist()
-
-        confs = no_h.GetConformers()
-        coords = [np.array(c.GetPositions()) for c in confs]
-        max_diff = 0.0
-        count = 0
-        max_checks = min(500, n_pairs)
-        for i in range(len(confs)):
-            for j in range(i):
-                idx = i * (i - 1) // 2 + j
-                ref = _numpy_kabsch_rmsd(coords[i], coords[j])
-                max_diff = max(max_diff, abs(gpu_rms[idx] - ref))
-                count += 1
-                if count >= max_checks:
-                    break
-            if count >= max_checks:
-                break
-        match_ok = max_diff < 0.05
-
-        speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
+def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
+    results = GetConformerRMSMatrixBatch(mols, prealigned=False)
+    for result in results:
+        result.torch()
+    torch.cuda.synchronize()
+
+
+def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
+    """Diff GPU RMSD matrices against RDKit on the first ``num_check`` mols.
+
+    Untimed; runs once before the sweep. Each pair of conformers in each mol
+    is compared element-wise; mismatches abort the benchmark so we never
+    publish a number for a broken kernel.
+    """
+    subset = mols[:num_check]
+    if not subset:
+        return
+    print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})")
+    gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False)
+    torch.cuda.synchronize()
+    max_abs_diff = 0.0
+    for mol_idx, mol in enumerate(subset):
+        rdkit_mol = Chem.Mol(mol.ToBinary())
+        rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)
+        gpu_rms = gpu_results[mol_idx].numpy().tolist()
+        if len(gpu_rms) != len(rdkit_rms):
+            raise RuntimeError(
+                f"validation: mol {mol_idx} pair count mismatch "
+                f"(gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
+            )
+        for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)):
+            diff = abs(float(gpu_val) - float(rdkit_val))
+            if diff > tol:
+                raise RuntimeError(
+                    f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} "
+                    f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})"
+                )
+            if diff > max_abs_diff:
+                max_abs_diff = diff
+    print(f"  OK (max abs diff {max_abs_diff:.5f})")
+
+
+def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]:
+    """Return copies of ``mols`` keeping only the first ``target`` conformers each.
+
+    The shared base set is prepared once at the maximum conformer count; this
+    helper produces the per-sweep-point view without re-embedding so every
+    sweep row sees the exact same molecule selection and base geometries.
+    """
+    out: list[Chem.Mol] = []
+    for mol in mols:
+        copy_mol = Chem.Mol(mol, True)  # quickCopy: keeps graph, drops conformers
+        confs = list(mol.GetConformers())[:target]
+        for conf in confs:
+            copy_mol.AddConformer(Chem.Conformer(conf), assignId=True)
+        out.append(copy_mol)
+    return out
+
 
+def run(
+    smiles_path: str,
+    num_mols: int,
+    confs_per_mol_list: list[int],
+    seed: int,
+    prep_workers: int,
+    rdkit_max_seconds: float,
+    validate_count: int,
+    validate_tol: float,
+    no_rdkit: bool,
+    no_nvmolkit: bool,
+    output: str | None,
+) -> None:
+    if no_rdkit and no_nvmolkit:
+        raise ValueError("cannot disable both RDKit and nvMolKit")
+    if any(count < 2 for count in confs_per_mol_list):
+        raise ValueError("every --confs_per_mol value must be >= 2")
+
+    if not no_nvmolkit:
+        print(f"GPU: {torch.cuda.get_device_name(0)}  CUDA: {torch.version.cuda}")
+    print(f"Loading SMILES from {smiles_path} (target {num_mols} mols)")
+    raw = load_smiles(smiles_path, max_count=num_mols, sanitize=True, seed=seed)
+
+    max_confs = max(confs_per_mol_list)
+    print(f"Preparing {len(raw)} mols x {max_confs} conformers (perturb-from-1-embed)")
+    base_mols = prepare_mols(raw, confs_per_mol=max_confs, seed=seed, num_workers=prep_workers)
+    if len(base_mols) > num_mols:
+        base_mols = base_mols[:num_mols]
+    if not base_mols:
+        raise RuntimeError("no molecules survived embedding")
+
+    avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols)
+    print(f"  {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol")
+    if validate_count > 0 and not no_rdkit and not no_nvmolkit:
+        validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol)
+    elif validate_count > 0:
+        print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)")
+
+    print(f"\nSweeping confs_per_mol: {confs_per_mol_list}")
+
+    rows: list[dict[str, float | int | str]] = []
+    for target_confs in sorted(confs_per_mol_list):
+        mols = _slice_to_confs(base_mols, target_confs)
+        actual_confs = [mol.GetNumConformers() for mol in mols]
+        total_pairs = sum(count * (count - 1) // 2 for count in actual_confs)
         print(
-            f"{actual_confs:>8}  {n_pairs:>10}  {cpu_time * 1000:>10.2f}  "
-            f"{gpu_time * 1000:>10.2f}  {speedup:>7.1f}x  "
-            f"{'OK' if match_ok else f'FAIL ({max_diff:.4f})':>6}"
+            f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, "
+            f"{total_pairs} RMSD pairs ==="
         )
 
+        row: dict[str, float | int | str] = {
+            "num_mols": len(mols),
+            "confs_per_mol": target_confs,
+            "total_pairs": total_pairs,
+            "avg_heavy_atoms": avg_atoms,
+        }
+
+        rdkit_mols_per_s: float | None = None
+        rdkit_pairs_per_s: float | None = None
+        if not no_rdkit:
+            payloads = [mol.ToBinary() for mol in mols]
+            cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap"
+            print(f"  RDKit CPU (single-threaded, {cap_label}):")
+            bench_rdkit_batch(payloads, rdkit_max_seconds)  # warmup
+            samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)]
+            samples.sort(key=lambda pair: pair[0] / max(pair[1], 1))
+            rdkit_time_s, rdkit_done = samples[len(samples) // 2]
+            pair_count_done = sum(
+                count * (count - 1) // 2 for count in actual_confs[:rdkit_done]
+            )
+            rdkit_mols_per_s = rdkit_done / rdkit_time_s
+            rdkit_pairs_per_s = pair_count_done / rdkit_time_s
+            truncated = rdkit_done < len(mols)
+            suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else ""
+            print(
+                f"    median wall: {rdkit_time_s * 1000:.1f} ms over {rdkit_done} mols  "
+                f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}"
+            )
+            row["rdkit_median_s"] = rdkit_time_s
+            row["rdkit_mols_processed"] = rdkit_done
+            row["rdkit_truncated"] = int(truncated)
+            row["rdkit_mols_per_s"] = rdkit_mols_per_s
+            row["rdkit_pairs_per_s"] = rdkit_pairs_per_s
+
+        gpu_pairs_per_s: float | None = None
+        if not no_nvmolkit:
+            print("  nvMolKit GPU (batched):")
+            result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True)
+            gpu_time_s = result.median_s
+            gpu_pairs_per_s = total_pairs / gpu_time_s
+            print(
+                f"    median wall: {gpu_time_s * 1000:.1f} ms  "
+                f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)"
+            )
+            row["gpu_median_s"] = gpu_time_s
+            row["gpu_mols_per_s"] = len(mols) / gpu_time_s
+            row["gpu_pairs_per_s"] = gpu_pairs_per_s
+
+        if rdkit_pairs_per_s is not None and gpu_pairs_per_s is not None:
+            row["speedup"] = gpu_pairs_per_s / rdkit_pairs_per_s
+            print(f"  GPU speedup vs single-threaded RDKit (pairs/s): {row['speedup']:.1f}x")
+
+        rows.append(row)
+
+    if output and rows:
+        out_path = Path(output)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        fieldnames: list[str] = []
+        for row in rows:
+            for key in row:
+                if key not in fieldnames:
+                    fieldnames.append(key)
+        with out_path.open("w", newline="") as fh:
+            writer = csv.DictWriter(fh, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
+        print(f"\nWrote {out_path}")
+
 
 def main():
-    parser = argparse.ArgumentParser(description="Benchmark GPU vs CPU conformer RMSD matrix")
-    parser.add_argument(
-        "--smiles",
-        nargs="+",
-        default=[
-            "CC(=O)Oc1ccccc1C(=O)O",  # aspirin (13 HA)
-            "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",  # celecoxib (24 HA)
-            "C=CC(=O)Nc1cc(OC)c(Nc2nccc(-c3cn(C)c4ccccc34)n2)cc1N(C)CCN(C)C",  # osimertinib (33 HA)
-            "CC(C)CC1NC(=O)C(CC(=O)O)NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(CO)NC(=O)C(Cc2c[nH]c3ccccc23)NC1=O",  # cyclic pentapeptide (~48 HA)
-        ],
-        help="SMILES strings to benchmark",
-    )
-    parser.add_argument(
-        "--num-confs",
-        nargs="+",
-        type=int,
-        default=[10, 50, 100, 200, 500],
-        help="Number of conformers to generate",
-    )
+    parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine")
+    parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file")
+    parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample")
+    parser.add_argument("--confs_per_mol", type=int, nargs="+", default=[10, 25, 50, 100, 200],
+                        help="Conformers-per-molecule sweep points (each >=2)")
+    parser.add_argument("--prep_workers", type=int, default=0,
+                        help="Workers for the embed-and-perturb prep step (0 = half of CPUs)")
+    parser.add_argument("--rdkit_max_seconds", type=float, default=0.0,
+                        help="Per-iteration wall-clock cap on the RDKit comparison "
+                             "(0 = no cap). When exceeded, throughput is reported "
+                             "over the molecules actually processed.")
+    parser.add_argument("--validate_count", type=int, default=8,
+                        help="Number of mols to compare GPU vs RDKit before timing "
+                             "(0 disables; requires both backends enabled)")
+    parser.add_argument("--validate_tol", type=float, default=0.05,
+                        help="Absolute tolerance (Angstroms) for per-pair RMSD diff")
+    parser.add_argument("--no_validate", action="store_true",
+                        help="Skip the GPU-vs-RDKit correctness check")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
+    parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark")
+    parser.add_argument("--no-nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark")
     args = parser.parse_args()
 
-    device_name = torch.cuda.get_device_name(0)
-    print(f"GPU: {device_name}")
-    print(f"CUDA: {torch.version.cuda}")
+    if args.no_rdkit and args.no_nvmolkit:
+        parser.error("cannot pass both --no-rdkit and --no-nvmolkit")
 
-    for smiles in args.smiles:
-        run_benchmark(smiles, args.num_confs)
+    run(
+        smiles_path=args.smiles,
+        num_mols=args.num_mols,
+        confs_per_mol_list=args.confs_per_mol,
+        seed=args.seed,
+        prep_workers=args.prep_workers,
+        rdkit_max_seconds=args.rdkit_max_seconds,
+        validate_count=0 if args.no_validate else args.validate_count,
+        validate_tol=args.validate_tol,
+        no_rdkit=args.no_rdkit,
+        no_nvmolkit=args.no_nvmolkit,
+        output=args.output,
+    )
 
     print("\nDone.")
 

From 390f9ff6abd696e43340af4f0b03fe5fdfd1caaa Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Fri, 29 May 2026 17:35:53 -0400
Subject: [PATCH 2/8] conformer_rmsd: trim docstrings to drop cross-bench
 rationale and narration

---
 benchmarks/conformer_rmsd_bench.py | 52 +++++++++++-------------------
 1 file changed, 18 insertions(+), 34 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index 83e0f63a..983d00b8 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -15,19 +15,14 @@
 
 """Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine.
 
-Mirrors the sampling strategy used by the FF / TFD benches: load a slice of
-Enamine REAL, embed one ETKDGv3 base conformer per molecule in parallel, and
-derive the remaining conformers by jittering the base structure. The two
-implementations being compared then process the *batch* of molecules:
-
-* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call covers every mol.
-* RDKit CPU:    ``AllChem.GetConformerRMSMatrix`` called in a serial loop,
-  matching the head-to-head convention of the other single-GPU benches
-  (butina, cross_similarity, tfd) which all compare against single-threaded
-  RDKit.
-
-Throughput is reported in molecules/s and pair-RMSDs/s so configurations with
-different conformer counts can be compared apples-to-apples.
+Loads a slice of Enamine REAL, embeds one ETKDGv3 base conformer per molecule
+and jitters it to produce the remaining conformers, then compares two
+implementations over the batch of molecules:
+
+* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call.
+* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` in a serial loop.
+
+Throughput is reported in molecules/s and pair-RMSDs/s.
 """
 
 import argparse
@@ -53,9 +48,7 @@ def prepare_mols(
 ) -> list[Chem.Mol]:
     """Embed one base conformer per mol, then perturb to ``confs_per_mol``.
 
-    Thin wrapper over :func:`bench_utils.embed_and_jitter` that enforces
-    confs_per_mol >= 2 (RMSD needs at least one pair) and adds explicit
-    hydrogens during embedding so ETKDG sees a chemically reasonable graph.
+    Requires ``confs_per_mol >= 2`` since RMSD needs at least one conformer pair.
     """
     if confs_per_mol < 2:
         raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}")
@@ -72,13 +65,11 @@ def prepare_mols(
 
 
 def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]:
-    """One RDKit timing iteration: serial loop, returns ``(elapsed_s, n_done)``.
+    """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``.
 
-    When ``max_seconds > 0`` the loop breaks after the deadline is exceeded;
-    callers compute throughput as ``n_done / elapsed_s`` so a truncated run
-    is still extrapolated to a fair pairs/s figure. ``GetConformerRMSMatrix``
-    mutates conformer coordinates in-place during Kabsch alignment, so each
-    call gets a fresh deserialization.
+    Stops once ``max_seconds`` is exceeded (``0`` means no cap). Each iteration
+    deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer
+    coordinates in-place during Kabsch alignment.
     """
     deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None
     start = time.perf_counter()
@@ -100,11 +91,9 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
 
 
 def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
-    """Diff GPU RMSD matrices against RDKit on the first ``num_check`` mols.
+    """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols.
 
-    Untimed; runs once before the sweep. Each pair of conformers in each mol
-    is compared element-wise; mismatches abort the benchmark so we never
-    publish a number for a broken kernel.
+    Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``.
     """
     subset = mols[:num_check]
     if not subset:
@@ -135,12 +124,7 @@ def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
 
 
 def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]:
-    """Return copies of ``mols`` keeping only the first ``target`` conformers each.
-
-    The shared base set is prepared once at the maximum conformer count; this
-    helper produces the per-sweep-point view without re-embedding so every
-    sweep row sees the exact same molecule selection and base geometries.
-    """
+    """Return copies of ``mols`` keeping only the first ``target`` conformers each."""
     out: list[Chem.Mol] = []
     for mol in mols:
         copy_mol = Chem.Mol(mol, True)  # quickCopy: keeps graph, drops conformers
@@ -263,8 +247,8 @@ def run(
             for key in row:
                 if key not in fieldnames:
                     fieldnames.append(key)
-        with out_path.open("w", newline="") as fh:
-            writer = csv.DictWriter(fh, fieldnames=fieldnames)
+        with out_path.open("w", newline="") as csv_file:
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()
             writer.writerows(rows)
         print(f"\nWrote {out_path}")

From a80a9d1a19b3d7d6e35d9d93a042c873a5846f91 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 08:39:10 -0400
Subject: [PATCH 3/8] conformer_rmsd: reflow long argparse calls onto single
 lines

Cosmetic only; matches the formatting style used in adjacent benches.
---
 benchmarks/conformer_rmsd_bench.py | 55 +++++++++++++++++-------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index 983d00b8..b6f113c2 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -108,8 +108,7 @@ def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
         gpu_rms = gpu_results[mol_idx].numpy().tolist()
         if len(gpu_rms) != len(rdkit_rms):
             raise RuntimeError(
-                f"validation: mol {mol_idx} pair count mismatch "
-                f"(gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
+                f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
             )
         for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)):
             diff = abs(float(gpu_val) - float(rdkit_val))
@@ -180,10 +179,7 @@ def run(
         mols = _slice_to_confs(base_mols, target_confs)
         actual_confs = [mol.GetNumConformers() for mol in mols]
         total_pairs = sum(count * (count - 1) // 2 for count in actual_confs)
-        print(
-            f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, "
-            f"{total_pairs} RMSD pairs ==="
-        )
+        print(f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, {total_pairs} RMSD pairs ===")
 
         row: dict[str, float | int | str] = {
             "num_mols": len(mols),
@@ -202,9 +198,7 @@ def run(
             samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)]
             samples.sort(key=lambda pair: pair[0] / max(pair[1], 1))
             rdkit_time_s, rdkit_done = samples[len(samples) // 2]
-            pair_count_done = sum(
-                count * (count - 1) // 2 for count in actual_confs[:rdkit_done]
-            )
+            pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done])
             rdkit_mols_per_s = rdkit_done / rdkit_time_s
             rdkit_pairs_per_s = pair_count_done / rdkit_time_s
             truncated = rdkit_done < len(mols)
@@ -258,21 +252,34 @@ def main():
     parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine")
     parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file")
     parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample")
-    parser.add_argument("--confs_per_mol", type=int, nargs="+", default=[10, 25, 50, 100, 200],
-                        help="Conformers-per-molecule sweep points (each >=2)")
-    parser.add_argument("--prep_workers", type=int, default=0,
-                        help="Workers for the embed-and-perturb prep step (0 = half of CPUs)")
-    parser.add_argument("--rdkit_max_seconds", type=float, default=0.0,
-                        help="Per-iteration wall-clock cap on the RDKit comparison "
-                             "(0 = no cap). When exceeded, throughput is reported "
-                             "over the molecules actually processed.")
-    parser.add_argument("--validate_count", type=int, default=8,
-                        help="Number of mols to compare GPU vs RDKit before timing "
-                             "(0 disables; requires both backends enabled)")
-    parser.add_argument("--validate_tol", type=float, default=0.05,
-                        help="Absolute tolerance (Angstroms) for per-pair RMSD diff")
-    parser.add_argument("--no_validate", action="store_true",
-                        help="Skip the GPU-vs-RDKit correctness check")
+    parser.add_argument(
+        "--confs_per_mol",
+        type=int,
+        nargs="+",
+        default=[10, 25, 50, 100, 200],
+        help="Conformers-per-molecule sweep points (each >=2)",
+    )
+    parser.add_argument(
+        "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)"
+    )
+    parser.add_argument(
+        "--rdkit_max_seconds",
+        type=float,
+        default=0.0,
+        help="Per-iteration wall-clock cap on the RDKit comparison "
+        "(0 = no cap). When exceeded, throughput is reported "
+        "over the molecules actually processed.",
+    )
+    parser.add_argument(
+        "--validate_count",
+        type=int,
+        default=8,
+        help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)",
+    )
+    parser.add_argument(
+        "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff"
+    )
+    parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check")
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
     parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark")

From 5ba61fc1610f1f2ba7985ccc5ee2429c78471a77 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 08:44:33 -0400
Subject: [PATCH 4/8] Fix some conf benchmark things

---
 benchmarks/conformer_rmsd_bench.py | 31 ++++++++++--------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index b6f113c2..324e462c 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -15,14 +15,9 @@
 
 """Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine.
 
-Loads a slice of Enamine REAL, embeds one ETKDGv3 base conformer per molecule
-and jitters it to produce the remaining conformers, then compares two
-implementations over the batch of molecules:
+Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's
+CPU GetConformerRMSMatrix across varying conformer counts.
 
-* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call.
-* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` in a serial loop.
-
-Throughput is reported in molecules/s and pair-RMSDs/s.
 """
 
 import argparse
@@ -32,7 +27,7 @@
 from pathlib import Path
 
 import torch
-from bench_utils import embed_and_jitter, load_smiles
+from bench_utils import Deadline, add_rdkit_max_seconds_arg, embed_and_jitter, load_smiles
 from benchmark_timing import time_it
 from rdkit import Chem
 from rdkit.Chem import AllChem
@@ -71,22 +66,20 @@ def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float,
     deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer
     coordinates in-place during Kabsch alignment.
     """
-    deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None
+    deadline = Deadline(max_seconds)
     start = time.perf_counter()
     n_done = 0
     for mol_bytes in payloads:
         mol = Chem.Mol(mol_bytes)
         AllChem.GetConformerRMSMatrix(mol, prealigned=False)
         n_done += 1
-        if deadline is not None and time.perf_counter() >= deadline:
+        if deadline.expired():
             break
     return time.perf_counter() - start, n_done
 
 
 def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
     results = GetConformerRMSMatrixBatch(mols, prealigned=False)
-    for result in results:
-        result.torch()
     torch.cuda.synchronize()
 
 
@@ -249,8 +242,8 @@ def run(
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine")
-    parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file")
+    parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark")
+    parser.add_argument("--smiles", required=True, help="Path to smiles file")
     parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample")
     parser.add_argument(
         "--confs_per_mol",
@@ -262,13 +255,9 @@ def main():
     parser.add_argument(
         "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)"
     )
-    parser.add_argument(
-        "--rdkit_max_seconds",
-        type=float,
-        default=0.0,
-        help="Per-iteration wall-clock cap on the RDKit comparison "
-        "(0 = no cap). When exceeded, throughput is reported "
-        "over the molecules actually processed.",
+    add_rdkit_max_seconds_arg(
+        parser,
+        extra_help="The cap applies per timing iteration and truncates at a molecule boundary.",
     )
     parser.add_argument(
         "--validate_count",

From 786d47ecfb716777158705dfb6319b980be1c6d4 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 08:59:47 -0400
Subject: [PATCH 5/8] Remove some stuff

---
 benchmarks/conformer_rmsd_bench.py | 71 ++++++------------------------
 1 file changed, 14 insertions(+), 57 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index 324e462c..c734eb96 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -23,6 +23,7 @@
 import argparse
 import csv
 import multiprocessing as mp
+import statistics
 import time
 from pathlib import Path
 
@@ -62,15 +63,14 @@ def prepare_mols(
 def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]:
     """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``.
 
-    Stops once ``max_seconds`` is exceeded (``0`` means no cap). Each iteration
-    deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer
-    coordinates in-place during Kabsch alignment.
+    Stops once ``max_seconds`` is exceeded (``0`` means no cap). A fresh mol is
+    built per call because ``GetConformerRMSMatrix`` aligns conformers in place.
     """
+    mols = [Chem.Mol(mol_bytes) for mol_bytes in payloads]
     deadline = Deadline(max_seconds)
     start = time.perf_counter()
     n_done = 0
-    for mol_bytes in payloads:
-        mol = Chem.Mol(mol_bytes)
+    for mol in mols:
         AllChem.GetConformerRMSMatrix(mol, prealigned=False)
         n_done += 1
         if deadline.expired():
@@ -83,38 +83,6 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
     torch.cuda.synchronize()
 
 
-def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
-    """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols.
-
-    Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``.
-    """
-    subset = mols[:num_check]
-    if not subset:
-        return
-    print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})")
-    gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False)
-    torch.cuda.synchronize()
-    max_abs_diff = 0.0
-    for mol_idx, mol in enumerate(subset):
-        rdkit_mol = Chem.Mol(mol.ToBinary())
-        rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)
-        gpu_rms = gpu_results[mol_idx].numpy().tolist()
-        if len(gpu_rms) != len(rdkit_rms):
-            raise RuntimeError(
-                f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
-            )
-        for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)):
-            diff = abs(float(gpu_val) - float(rdkit_val))
-            if diff > tol:
-                raise RuntimeError(
-                    f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} "
-                    f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})"
-                )
-            if diff > max_abs_diff:
-                max_abs_diff = diff
-    print(f"  OK (max abs diff {max_abs_diff:.5f})")
-
-
 def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]:
     """Return copies of ``mols`` keeping only the first ``target`` conformers each."""
     out: list[Chem.Mol] = []
@@ -134,8 +102,6 @@ def run(
     seed: int,
     prep_workers: int,
     rdkit_max_seconds: float,
-    validate_count: int,
-    validate_tol: float,
     no_rdkit: bool,
     no_nvmolkit: bool,
     output: str | None,
@@ -160,10 +126,6 @@ def run(
 
     avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols)
     print(f"  {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol")
-    if validate_count > 0 and not no_rdkit and not no_nvmolkit:
-        validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol)
-    elif validate_count > 0:
-        print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)")
 
     print(f"\nSweeping confs_per_mol: {confs_per_mol_list}")
 
@@ -187,20 +149,25 @@ def run(
             payloads = [mol.ToBinary() for mol in mols]
             cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap"
             print(f"  RDKit CPU (single-threaded, {cap_label}):")
+            # TODO: replace this hand-rolled warmup/sample/median loop with time_it once
+            # time_it can consume a Deadline and truncate a run mid-workload.
+            # https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/186
             bench_rdkit_batch(payloads, rdkit_max_seconds)  # warmup
             samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)]
             samples.sort(key=lambda pair: pair[0] / max(pair[1], 1))
             rdkit_time_s, rdkit_done = samples[len(samples) // 2]
+            rdkit_std_s = statistics.stdev([elapsed for elapsed, _ in samples]) if len(samples) > 1 else 0.0
             pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done])
             rdkit_mols_per_s = rdkit_done / rdkit_time_s
             rdkit_pairs_per_s = pair_count_done / rdkit_time_s
             truncated = rdkit_done < len(mols)
             suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else ""
             print(
-                f"    median wall: {rdkit_time_s * 1000:.1f} ms over {rdkit_done} mols  "
+                f"    median wall: {rdkit_time_s * 1000:.1f} +/- {rdkit_std_s * 1000:.1f} ms over {rdkit_done} mols  "
                 f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}"
             )
             row["rdkit_median_s"] = rdkit_time_s
+            row["rdkit_std_s"] = rdkit_std_s
             row["rdkit_mols_processed"] = rdkit_done
             row["rdkit_truncated"] = int(truncated)
             row["rdkit_mols_per_s"] = rdkit_mols_per_s
@@ -211,12 +178,14 @@ def run(
             print("  nvMolKit GPU (batched):")
             result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True)
             gpu_time_s = result.median_s
+            gpu_std_s = result.std_ms / 1000.0
             gpu_pairs_per_s = total_pairs / gpu_time_s
             print(
-                f"    median wall: {gpu_time_s * 1000:.1f} ms  "
+                f"    median wall: {gpu_time_s * 1000:.1f} +/- {gpu_std_s * 1000:.1f} ms  "
                 f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)"
             )
             row["gpu_median_s"] = gpu_time_s
+            row["gpu_std_s"] = gpu_std_s
             row["gpu_mols_per_s"] = len(mols) / gpu_time_s
             row["gpu_pairs_per_s"] = gpu_pairs_per_s
 
@@ -259,16 +228,6 @@ def main():
         parser,
         extra_help="The cap applies per timing iteration and truncates at a molecule boundary.",
     )
-    parser.add_argument(
-        "--validate_count",
-        type=int,
-        default=8,
-        help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)",
-    )
-    parser.add_argument(
-        "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff"
-    )
-    parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check")
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
     parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark")
@@ -285,8 +244,6 @@ def main():
         seed=args.seed,
         prep_workers=args.prep_workers,
         rdkit_max_seconds=args.rdkit_max_seconds,
-        validate_count=0 if args.no_validate else args.validate_count,
-        validate_tol=args.validate_tol,
         no_rdkit=args.no_rdkit,
         no_nvmolkit=args.no_nvmolkit,
         output=args.output,

From 34a284b1cdad48965ae8dd6997f9fc1af70d5c08 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 09:03:25 -0400
Subject: [PATCH 6/8] Fix minor things

---
 benchmarks/conformer_rmsd_bench.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index c734eb96..92e0e91f 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -13,11 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine.
+"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD.
 
 Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's
 CPU GetConformerRMSMatrix across varying conformer counts.
-
 """
 
 import argparse
@@ -42,12 +41,7 @@ def prepare_mols(
     seed: int,
     num_workers: int,
 ) -> list[Chem.Mol]:
-    """Embed one base conformer per mol, then perturb to ``confs_per_mol``.
-
-    Requires ``confs_per_mol >= 2`` since RMSD needs at least one conformer pair.
-    """
-    if confs_per_mol < 2:
-        raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}")
+    """Embed one base conformer per mol, then perturb to ``confs_per_mol``."""
     workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2)
     return embed_and_jitter(
         raw_mols,
@@ -230,12 +224,12 @@ def main():
     )
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
-    parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark")
-    parser.add_argument("--no-nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark")
+    parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark")
+    parser.add_argument("--no_nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark")
     args = parser.parse_args()
 
     if args.no_rdkit and args.no_nvmolkit:
-        parser.error("cannot pass both --no-rdkit and --no-nvmolkit")
+        parser.error("cannot pass both --no_rdkit and --no_nvmolkit")
 
     run(
         smiles_path=args.smiles,

From 3700a3c60352a7e6e7dd3b287509247bb9f283b3 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 09:07:21 -0400
Subject: [PATCH 7/8] Add validation back in

---
 benchmarks/conformer_rmsd_bench.py | 50 ++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index 92e0e91f..ee8d49ff 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -77,6 +77,38 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
     torch.cuda.synchronize()
 
 
+def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
+    """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols.
+
+    Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``.
+    """
+    subset = mols[:num_check]
+    if not subset:
+        return
+    print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})")
+    gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False)
+    torch.cuda.synchronize()
+    max_abs_diff = 0.0
+    for mol_idx, mol in enumerate(subset):
+        rdkit_mol = Chem.Mol(mol.ToBinary())
+        rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)
+        gpu_rms = gpu_results[mol_idx].numpy().tolist()
+        if len(gpu_rms) != len(rdkit_rms):
+            raise RuntimeError(
+                f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
+            )
+        for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)):
+            diff = abs(float(gpu_val) - float(rdkit_val))
+            if diff > tol:
+                raise RuntimeError(
+                    f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} "
+                    f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})"
+                )
+            if diff > max_abs_diff:
+                max_abs_diff = diff
+    print(f"  OK (max abs diff {max_abs_diff:.5f})")
+
+
 def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]:
     """Return copies of ``mols`` keeping only the first ``target`` conformers each."""
     out: list[Chem.Mol] = []
@@ -96,6 +128,8 @@ def run(
     seed: int,
     prep_workers: int,
     rdkit_max_seconds: float,
+    validate_count: int,
+    validate_tol: float,
     no_rdkit: bool,
     no_nvmolkit: bool,
     output: str | None,
@@ -120,6 +154,10 @@ def run(
 
     avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols)
     print(f"  {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol")
+    if validate_count > 0 and not no_rdkit and not no_nvmolkit:
+        validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol)
+    elif validate_count > 0:
+        print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)")
 
     print(f"\nSweeping confs_per_mol: {confs_per_mol_list}")
 
@@ -222,6 +260,16 @@ def main():
         parser,
         extra_help="The cap applies per timing iteration and truncates at a molecule boundary.",
     )
+    parser.add_argument(
+        "--validate_count",
+        type=int,
+        default=8,
+        help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)",
+    )
+    parser.add_argument(
+        "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff"
+    )
+    parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check")
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
     parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark")
@@ -238,6 +286,8 @@ def main():
         seed=args.seed,
         prep_workers=args.prep_workers,
         rdkit_max_seconds=args.rdkit_max_seconds,
+        validate_count=0 if args.no_validate else args.validate_count,
+        validate_tol=args.validate_tol,
         no_rdkit=args.no_rdkit,
         no_nvmolkit=args.no_nvmolkit,
         output=args.output,

From 3293e1f96054bde89ed0e54cc87da9eaced95332 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Tue, 2 Jun 2026 12:23:08 -0400
Subject: [PATCH 8/8] Fix std bug

---
 benchmarks/conformer_rmsd_bench.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index ee8d49ff..e9123a0b 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -188,7 +188,8 @@ def run(
             samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)]
             samples.sort(key=lambda pair: pair[0] / max(pair[1], 1))
             rdkit_time_s, rdkit_done = samples[len(samples) // 2]
-            rdkit_std_s = statistics.stdev([elapsed for elapsed, _ in samples]) if len(samples) > 1 else 0.0
+            per_mol_times = [elapsed / max(done, 1) for elapsed, done in samples]
+            rdkit_std_s = statistics.stdev(per_mol_times) * rdkit_done if len(samples) > 1 else 0.0
             pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done])
             rdkit_mols_per_s = rdkit_done / rdkit_time_s
             rdkit_pairs_per_s = pair_count_done / rdkit_time_s