diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py
index dbd688c5..e9123a0b 100644
--- a/benchmarks/conformer_rmsd_bench.py
+++ b/benchmarks/conformer_rmsd_bench.py
@@ -13,163 +13,286 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Benchmark: GPU vs CPU conformer RMSD matrix computation.
+"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD.
 
 Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's
-CPU GetConformerRMSMatrix across varying conformer counts and molecule sizes.
-
-Usage:
-    python conformer_rmsd_bench.py
-    python conformer_rmsd_bench.py --num-confs 50 100 200 500
-    python conformer_rmsd_bench.py --smiles "CCCCCCCCCC" --num-confs 500
+CPU GetConformerRMSMatrix across varying conformer counts.
 """
 
 import argparse
-import copy
+import csv
+import multiprocessing as mp
+import statistics
+import time
+from pathlib import Path
 
-import numpy as np
 import torch
-from bench_utils import perturb_conformer
+from bench_utils import Deadline, add_rdkit_max_seconds_arg, embed_and_jitter, load_smiles
 from benchmark_timing import time_it
 from rdkit import Chem
-from rdkit.Chem import AllChem, rdDistGeom
+from rdkit.Chem import AllChem
 
-from nvmolkit.conformerRmsd import GetConformerRMSMatrix
+from nvmolkit.conformerRmsd import GetConformerRMSMatrixBatch
 
 
-def _numpy_kabsch_rmsd(p, q):
-    """Independent Kabsch RMSD using numpy SVD."""
-    p_c = p - p.mean(axis=0)
-    q_c = q - q.mean(axis=0)
-    H = p_c.T @ q_c
-    S = np.linalg.svd(H, compute_uv=False)
-    d = np.sign(np.linalg.det(H))
-    S[-1] *= d if d != 0.0 else 1.0
-    Sp = np.sum(p_c**2)
-    Sq = np.sum(q_c**2)
-    return np.sqrt(max((Sp + Sq - 2.0 * np.sum(S)) / len(p), 0.0))
+def prepare_mols(
+    raw_mols: list[Chem.Mol],
+    confs_per_mol: int,
+    seed: int,
+    num_workers: int,
+) -> list[Chem.Mol]:
+    """Embed one base conformer per mol, then perturb to ``confs_per_mol``."""
+    workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2)
+    return embed_and_jitter(
+        raw_mols,
+        confs_per_mol=confs_per_mol,
+        seed=seed,
+        num_workers=workers,
+        add_hs=True,
+        min_atoms=2,
+        desc=f"Embed + perturb ({confs_per_mol} confs)",
+    )
 
 
-def benchmark_cpu(mol, n_warmup=1, n_iter=5):
-    """Benchmark RDKit CPU GetConformerRMSMatrix.
+def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]:
+    """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``.
 
-    Deep-copies the molecule before each call because GetConformerRMSMatrix
-    modifies conformer coordinates in-place during alignment; reusing the
-    same molecule would measure already-aligned conformers and understate
-    the true CPU cost.
+    Stops once ``max_seconds`` is exceeded (``0`` means no cap). A fresh mol is
+    built per call because ``GetConformerRMSMatrix`` aligns conformers in place.
     """
-    result = time_it(
-        lambda: AllChem.GetConformerRMSMatrix(copy.deepcopy(mol), prealigned=False), runs=n_iter, warmups=n_warmup
-    )
-    return result.median_s
+    mols = [Chem.Mol(mol_bytes) for mol_bytes in payloads]
+    deadline = Deadline(max_seconds)
+    start = time.perf_counter()
+    n_done = 0
+    for mol in mols:
+        AllChem.GetConformerRMSMatrix(mol, prealigned=False)
+        n_done += 1
+        if deadline.expired():
+            break
+    return time.perf_counter() - start, n_done
 
 
-def benchmark_gpu(mol, n_warmup=2, n_iter=10):
-    """Benchmark nvMolKit GPU GetConformerRMSMatrix."""
-    result = time_it(
-        lambda: GetConformerRMSMatrix(mol, prealigned=False), runs=n_iter, warmups=n_warmup, gpu_sync=True
-    )
-    return result.median_s
-
-
-def run_benchmark(smiles, num_confs_list, seed=42):
-    """Run CPU vs GPU benchmark for a molecule at various conformer counts."""
-    mol_base = Chem.AddHs(Chem.MolFromSmiles(smiles))
-    no_h_base = Chem.RemoveHs(Chem.AddHs(Chem.MolFromSmiles(smiles)))
-    n_atoms = no_h_base.GetNumAtoms()
-
-    print(f"\nMolecule: {smiles}  ({n_atoms} heavy atoms)")
-    print(f"{'Confs':>8}  {'Pairs':>10}  {'CPU (ms)':>10}  {'GPU (ms)':>10}  {'Speedup':>8}  {'Match':>6}")
-    print("-" * 70)
-
-    for num_confs in num_confs_list:
-        mol = Chem.RWMol(mol_base)
-        mol.RemoveAllConformers()
-        params = rdDistGeom.ETKDGv3()
-        params.randomSeed = seed
-        params.useRandomCoords = True
-        if rdDistGeom.EmbedMolecule(mol, params=params) < 0:
-            print(f"{num_confs:>8}  {'skipped (embedding failed)':>50}")
-            continue
-        if num_confs < 2:
-            print(f"{num_confs:>8}  {'skipped (need >= 2 confs for RMSD)':>50}")
-            continue
-        base_conf_id = mol.GetConformer().GetId()
-        for conf_idx in range(1, num_confs):
-            new_conf = Chem.Conformer(mol.GetConformer(base_conf_id))
-            perturb_conformer(new_conf, seed=seed + conf_idx)
-            mol.AddConformer(new_conf, assignId=True)
-        perturb_conformer(mol.GetConformer(base_conf_id), seed=seed)
-        actual_confs = mol.GetNumConformers()
-
-        no_h = Chem.RemoveHs(mol)
-        n_pairs = actual_confs * (actual_confs - 1) // 2
-
-        # CPU benchmark
-        cpu_time = benchmark_cpu(no_h)
-
-        # GPU benchmark
-        gpu_time = benchmark_gpu(no_h)
-
-        # Correctness check against numpy Kabsch SVD (sample up to 500 pairs)
-        gpu_result = GetConformerRMSMatrix(no_h, prealigned=False)
-        torch.cuda.synchronize()
-        gpu_rms = gpu_result.numpy().tolist()
-
-        confs = no_h.GetConformers()
-        coords = [np.array(c.GetPositions()) for c in confs]
-        max_diff = 0.0
-        count = 0
-        max_checks = min(500, n_pairs)
-        for i in range(len(confs)):
-            for j in range(i):
-                idx = i * (i - 1) // 2 + j
-                ref = _numpy_kabsch_rmsd(coords[i], coords[j])
-                max_diff = max(max_diff, abs(gpu_rms[idx] - ref))
-                count += 1
-                if count >= max_checks:
-                    break
-            if count >= max_checks:
-                break
-        match_ok = max_diff < 0.05
-
-        speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
-
-        print(
-            f"{actual_confs:>8}  {n_pairs:>10}  {cpu_time * 1000:>10.2f}  "
-            f"{gpu_time * 1000:>10.2f}  {speedup:>7.1f}x  "
-            f"{'OK' if match_ok else f'FAIL ({max_diff:.4f})':>6}"
-        )
+def bench_gpu_batch(mols: list[Chem.Mol]) -> None:
+    results = GetConformerRMSMatrixBatch(mols, prealigned=False)
+    torch.cuda.synchronize()
+
+
+def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None:
+    """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols.
+
+    Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``.
+    """
+    subset = mols[:num_check]
+    if not subset:
+        return
+    print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})")
+    gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False)
+    torch.cuda.synchronize()
+    max_abs_diff = 0.0
+    for mol_idx, mol in enumerate(subset):
+        rdkit_mol = Chem.Mol(mol.ToBinary())
+        rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)
+        gpu_rms = gpu_results[mol_idx].numpy().tolist()
+        if len(gpu_rms) != len(rdkit_rms):
+            raise RuntimeError(
+                f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})"
+            )
+        for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)):
+            diff = abs(float(gpu_val) - float(rdkit_val))
+            if diff > tol:
+                raise RuntimeError(
+                    f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} "
+                    f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})"
+                )
+            if diff > max_abs_diff:
+                max_abs_diff = diff
+    print(f"  OK (max abs diff {max_abs_diff:.5f})")
+
+
+def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]:
+    """Return copies of ``mols`` keeping only the first ``target`` conformers each."""
+    out: list[Chem.Mol] = []
+    for mol in mols:
+        copy_mol = Chem.Mol(mol, True)  # quickCopy: keeps graph, drops conformers
+        confs = list(mol.GetConformers())[:target]
+        for conf in confs:
+            copy_mol.AddConformer(Chem.Conformer(conf), assignId=True)
+        out.append(copy_mol)
+    return out
+
+
+def run(
+    smiles_path: str,
+    num_mols: int,
+    confs_per_mol_list: list[int],
+    seed: int,
+    prep_workers: int,
+    rdkit_max_seconds: float,
+    validate_count: int,
+    validate_tol: float,
+    no_rdkit: bool,
+    no_nvmolkit: bool,
+    output: str | None,
+) -> None:
+    if no_rdkit and no_nvmolkit:
+        raise ValueError("cannot disable both RDKit and nvMolKit")
+    if any(count < 2 for count in confs_per_mol_list):
+        raise ValueError("every --confs_per_mol value must be >= 2")
+
+    if not no_nvmolkit:
+        print(f"GPU: {torch.cuda.get_device_name(0)}  CUDA: {torch.version.cuda}")
+    print(f"Loading SMILES from {smiles_path} (target {num_mols} mols)")
+    raw = load_smiles(smiles_path, max_count=num_mols, sanitize=True, seed=seed)
+
+    max_confs = max(confs_per_mol_list)
+    print(f"Preparing {len(raw)} mols x {max_confs} conformers (perturb-from-1-embed)")
+    base_mols = prepare_mols(raw, confs_per_mol=max_confs, seed=seed, num_workers=prep_workers)
+    if len(base_mols) > num_mols:
+        base_mols = base_mols[:num_mols]
+    if not base_mols:
+        raise RuntimeError("no molecules survived embedding")
+
+    avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols)
+    print(f"  {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol")
+    if validate_count > 0 and not no_rdkit and not no_nvmolkit:
+        validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol)
+    elif validate_count > 0:
+        print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)")
+
+    print(f"\nSweeping confs_per_mol: {confs_per_mol_list}")
+
+    rows: list[dict[str, float | int | str]] = []
+    for target_confs in sorted(confs_per_mol_list):
+        mols = _slice_to_confs(base_mols, target_confs)
+        actual_confs = [mol.GetNumConformers() for mol in mols]
+        total_pairs = sum(count * (count - 1) // 2 for count in actual_confs)
+        print(f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, {total_pairs} RMSD pairs ===")
+
+        row: dict[str, float | int | str] = {
+            "num_mols": len(mols),
+            "confs_per_mol": target_confs,
+            "total_pairs": total_pairs,
+            "avg_heavy_atoms": avg_atoms,
+        }
+
+        rdkit_mols_per_s: float | None = None
+        rdkit_pairs_per_s: float | None = None
+        if not no_rdkit:
+            payloads = [mol.ToBinary() for mol in mols]
+            cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap"
+            print(f"  RDKit CPU (single-threaded, {cap_label}):")
+            # TODO: replace this hand-rolled warmup/sample/median loop with time_it once
+            # time_it can consume a Deadline and truncate a run mid-workload.
+            # https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/186
+            bench_rdkit_batch(payloads, rdkit_max_seconds)  # warmup
+            samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)]
+            samples.sort(key=lambda pair: pair[0] / max(pair[1], 1))
+            rdkit_time_s, rdkit_done = samples[len(samples) // 2]
+            per_mol_times = [elapsed / max(done, 1) for elapsed, done in samples]
+            rdkit_std_s = statistics.stdev(per_mol_times) * rdkit_done if len(samples) > 1 else 0.0
+            pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done])
+            rdkit_mols_per_s = rdkit_done / rdkit_time_s
+            rdkit_pairs_per_s = pair_count_done / rdkit_time_s
+            truncated = rdkit_done < len(mols)
+            suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else ""
+            print(
+                f"    median wall: {rdkit_time_s * 1000:.1f} +/- {rdkit_std_s * 1000:.1f} ms over {rdkit_done} mols  "
+                f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}"
+            )
+            row["rdkit_median_s"] = rdkit_time_s
+            row["rdkit_std_s"] = rdkit_std_s
+            row["rdkit_mols_processed"] = rdkit_done
+            row["rdkit_truncated"] = int(truncated)
+            row["rdkit_mols_per_s"] = rdkit_mols_per_s
+            row["rdkit_pairs_per_s"] = rdkit_pairs_per_s
+
+        gpu_pairs_per_s: float | None = None
+        if not no_nvmolkit:
+            print("  nvMolKit GPU (batched):")
+            result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True)
+            gpu_time_s = result.median_s
+            gpu_std_s = result.std_ms / 1000.0
+            gpu_pairs_per_s = total_pairs / gpu_time_s
+            print(
+                f"    median wall: {gpu_time_s * 1000:.1f} +/- {gpu_std_s * 1000:.1f} ms  "
+                f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)"
+            )
+            row["gpu_median_s"] = gpu_time_s
+            row["gpu_std_s"] = gpu_std_s
+            row["gpu_mols_per_s"] = len(mols) / gpu_time_s
+            row["gpu_pairs_per_s"] = gpu_pairs_per_s
+
+        if rdkit_pairs_per_s is not None and gpu_pairs_per_s is not None:
+            row["speedup"] = gpu_pairs_per_s / rdkit_pairs_per_s
+            print(f"  GPU speedup vs single-threaded RDKit (pairs/s): {row['speedup']:.1f}x")
+
+        rows.append(row)
+
+    if output and rows:
+        out_path = Path(output)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        fieldnames: list[str] = []
+        for row in rows:
+            for key in row:
+                if key not in fieldnames:
+                    fieldnames.append(key)
+        with out_path.open("w", newline="") as csv_file:
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
+        print(f"\nWrote {out_path}")
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Benchmark GPU vs CPU conformer RMSD matrix")
+    parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark")
+    parser.add_argument("--smiles", required=True, help="Path to smiles file")
+    parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample")
     parser.add_argument(
-        "--smiles",
+        "--confs_per_mol",
+        type=int,
         nargs="+",
-        default=[
-            "CC(=O)Oc1ccccc1C(=O)O",  # aspirin (13 HA)
-            "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",  # celecoxib (24 HA)
-            "C=CC(=O)Nc1cc(OC)c(Nc2nccc(-c3cn(C)c4ccccc34)n2)cc1N(C)CCN(C)C",  # osimertinib (33 HA)
-            "CC(C)CC1NC(=O)C(CC(=O)O)NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(CO)NC(=O)C(Cc2c[nH]c3ccccc23)NC1=O",  # cyclic pentapeptide (~48 HA)
-        ],
-        help="SMILES strings to benchmark",
+        default=[10, 25, 50, 100, 200],
+        help="Conformers-per-molecule sweep points (each >=2)",
     )
     parser.add_argument(
-        "--num-confs",
-        nargs="+",
+        "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)"
+    )
+    add_rdkit_max_seconds_arg(
+        parser,
+        extra_help="The cap applies per timing iteration and truncates at a molecule boundary.",
+    )
+    parser.add_argument(
+        "--validate_count",
         type=int,
-        default=[10, 50, 100, 200, 500],
-        help="Number of conformers to generate",
+        default=8,
+        help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)",
     )
+    parser.add_argument(
+        "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff"
+    )
+    parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--output", type=str, default=None, help="Optional CSV output path")
+    parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark")
+    parser.add_argument("--no_nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark")
     args = parser.parse_args()
 
-    device_name = torch.cuda.get_device_name(0)
-    print(f"GPU: {device_name}")
-    print(f"CUDA: {torch.version.cuda}")
+    if args.no_rdkit and args.no_nvmolkit:
+        parser.error("cannot pass both --no_rdkit and --no_nvmolkit")
 
-    for smiles in args.smiles:
-        run_benchmark(smiles, args.num_confs)
+    run(
+        smiles_path=args.smiles,
+        num_mols=args.num_mols,
+        confs_per_mol_list=args.confs_per_mol,
+        seed=args.seed,
+        prep_workers=args.prep_workers,
+        rdkit_max_seconds=args.rdkit_max_seconds,
+        validate_count=0 if args.no_validate else args.validate_count,
+        validate_tol=args.validate_tol,
+        no_rdkit=args.no_rdkit,
+        no_nvmolkit=args.no_nvmolkit,
+        output=args.output,
+    )
 
     print("\nDone.")