From 7b36907371280c160e0107300ed95b314966294a Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Fri, 29 May 2026 14:17:16 -0400 Subject: [PATCH 1/8] conformer_rmsd: rewrite to batch-mode benchmark Replace the per-mol, hardcoded-SMILES benchmark with a batch-mode bench that: * Loads a slice of SMILES from a file (via bench_utils.load_smiles). * Embeds one base conformer per mol in parallel and jitters via the shared bench_utils.embed_and_jitter (with add_hs so ETKDG sees a chemically reasonable graph). * Times a single GetConformerRMSMatrixBatch call vs a serial RDKit loop. * Sweeps confs_per_mol with a single embed run plus _slice_to_confs reuse, so every row sees the same molecule selection. * Validates GPU output against RDKit (per-pair, with a tolerance) before timing and aborts on mismatch. * Honors --rdkit_max_seconds for the RDKit comparison and --no-rdkit / --no-nvmolkit for mode selection. --- benchmarks/conformer_rmsd_bench.py | 401 +++++++++++++++++++---------- 1 file changed, 271 insertions(+), 130 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index dbd688c5..83e0f63a 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -13,163 +13,304 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Benchmark: GPU vs CPU conformer RMSD matrix computation. +"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine. -Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's -CPU GetConformerRMSMatrix across varying conformer counts and molecule sizes. +Mirrors the sampling strategy used by the FF / TFD benches: load a slice of +Enamine REAL, embed one ETKDGv3 base conformer per molecule in parallel, and +derive the remaining conformers by jittering the base structure. The two +implementations being compared then process the *batch* of molecules: -Usage: - python conformer_rmsd_bench.py - python conformer_rmsd_bench.py --num-confs 50 100 200 500 - python conformer_rmsd_bench.py --smiles "CCCCCCCCCC" --num-confs 500 +* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call covers every mol. +* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` called in a serial loop, + matching the head-to-head convention of the other single-GPU benches + (butina, cross_similarity, tfd) which all compare against single-threaded + RDKit. + +Throughput is reported in molecules/s and pair-RMSDs/s so configurations with +different conformer counts can be compared apples-to-apples. """ import argparse -import copy +import csv +import multiprocessing as mp +import time +from pathlib import Path -import numpy as np import torch -from bench_utils import perturb_conformer +from bench_utils import embed_and_jitter, load_smiles from benchmark_timing import time_it from rdkit import Chem -from rdkit.Chem import AllChem, rdDistGeom +from rdkit.Chem import AllChem + +from nvmolkit.conformerRmsd import GetConformerRMSMatrixBatch -from nvmolkit.conformerRmsd import GetConformerRMSMatrix +def prepare_mols( + raw_mols: list[Chem.Mol], + confs_per_mol: int, + seed: int, + num_workers: int, +) -> list[Chem.Mol]: + """Embed one base conformer per mol, then perturb to ``confs_per_mol``. -def _numpy_kabsch_rmsd(p, q): - """Independent Kabsch RMSD using numpy SVD.""" - p_c = p - p.mean(axis=0) - q_c = q - q.mean(axis=0) - H = p_c.T @ q_c - S = np.linalg.svd(H, compute_uv=False) - d = np.sign(np.linalg.det(H)) - S[-1] *= d if d != 0.0 else 1.0 - Sp = np.sum(p_c**2) - Sq = np.sum(q_c**2) - return np.sqrt(max((Sp + Sq - 2.0 * np.sum(S)) / len(p), 0.0)) + Thin wrapper over :func:`bench_utils.embed_and_jitter` that enforces + confs_per_mol >= 2 (RMSD needs at least one pair) and adds explicit + hydrogens during embedding so ETKDG sees a chemically reasonable graph. + """ + if confs_per_mol < 2: + raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}") + workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2) + return embed_and_jitter( + raw_mols, + confs_per_mol=confs_per_mol, + seed=seed, + num_workers=workers, + add_hs=True, + min_atoms=2, + desc=f"Embed + perturb ({confs_per_mol} confs)", + ) -def benchmark_cpu(mol, n_warmup=1, n_iter=5): - """Benchmark RDKit CPU GetConformerRMSMatrix. +def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]: + """One RDKit timing iteration: serial loop, returns ``(elapsed_s, n_done)``. - Deep-copies the molecule before each call because GetConformerRMSMatrix - modifies conformer coordinates in-place during alignment; reusing the - same molecule would measure already-aligned conformers and understate - the true CPU cost. + When ``max_seconds > 0`` the loop breaks after the deadline is exceeded; + callers compute throughput as ``n_done / elapsed_s`` so a truncated run + is still extrapolated to a fair pairs/s figure. ``GetConformerRMSMatrix`` + mutates conformer coordinates in-place during Kabsch alignment, so each + call gets a fresh deserialization. """ - result = time_it( - lambda: AllChem.GetConformerRMSMatrix(copy.deepcopy(mol), prealigned=False), runs=n_iter, warmups=n_warmup - ) - return result.median_s + deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None + start = time.perf_counter() + n_done = 0 + for mol_bytes in payloads: + mol = Chem.Mol(mol_bytes) + AllChem.GetConformerRMSMatrix(mol, prealigned=False) + n_done += 1 + if deadline is not None and time.perf_counter() >= deadline: + break + return time.perf_counter() - start, n_done -def benchmark_gpu(mol, n_warmup=2, n_iter=10): - """Benchmark nvMolKit GPU GetConformerRMSMatrix.""" - result = time_it( - lambda: GetConformerRMSMatrix(mol, prealigned=False), runs=n_iter, warmups=n_warmup, gpu_sync=True - ) - return result.median_s - - -def run_benchmark(smiles, num_confs_list, seed=42): - """Run CPU vs GPU benchmark for a molecule at various conformer counts.""" - mol_base = Chem.AddHs(Chem.MolFromSmiles(smiles)) - no_h_base = Chem.RemoveHs(Chem.AddHs(Chem.MolFromSmiles(smiles))) - n_atoms = no_h_base.GetNumAtoms() - - print(f"\nMolecule: {smiles} ({n_atoms} heavy atoms)") - print(f"{'Confs':>8} {'Pairs':>10} {'CPU (ms)':>10} {'GPU (ms)':>10} {'Speedup':>8} {'Match':>6}") - print("-" * 70) - - for num_confs in num_confs_list: - mol = Chem.RWMol(mol_base) - mol.RemoveAllConformers() - params = rdDistGeom.ETKDGv3() - params.randomSeed = seed - params.useRandomCoords = True - if rdDistGeom.EmbedMolecule(mol, params=params) < 0: - print(f"{num_confs:>8} {'skipped (embedding failed)':>50}") - continue - if num_confs < 2: - print(f"{num_confs:>8} {'skipped (need >= 2 confs for RMSD)':>50}") - continue - base_conf_id = mol.GetConformer().GetId() - for conf_idx in range(1, num_confs): - new_conf = Chem.Conformer(mol.GetConformer(base_conf_id)) - perturb_conformer(new_conf, seed=seed + conf_idx) - mol.AddConformer(new_conf, assignId=True) - perturb_conformer(mol.GetConformer(base_conf_id), seed=seed) - actual_confs = mol.GetNumConformers() - - no_h = Chem.RemoveHs(mol) - n_pairs = actual_confs * (actual_confs - 1) // 2 - - # CPU benchmark - cpu_time = benchmark_cpu(no_h) - - # GPU benchmark - gpu_time = benchmark_gpu(no_h) - - # Correctness check against numpy Kabsch SVD (sample up to 500 pairs) - gpu_result = GetConformerRMSMatrix(no_h, prealigned=False) - torch.cuda.synchronize() - gpu_rms = gpu_result.numpy().tolist() - - confs = no_h.GetConformers() - coords = [np.array(c.GetPositions()) for c in confs] - max_diff = 0.0 - count = 0 - max_checks = min(500, n_pairs) - for i in range(len(confs)): - for j in range(i): - idx = i * (i - 1) // 2 + j - ref = _numpy_kabsch_rmsd(coords[i], coords[j]) - max_diff = max(max_diff, abs(gpu_rms[idx] - ref)) - count += 1 - if count >= max_checks: - break - if count >= max_checks: - break - match_ok = max_diff < 0.05 - - speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") +def bench_gpu_batch(mols: list[Chem.Mol]) -> None: + results = GetConformerRMSMatrixBatch(mols, prealigned=False) + for result in results: + result.torch() + torch.cuda.synchronize() + + +def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: + """Diff GPU RMSD matrices against RDKit on the first ``num_check`` mols. + + Untimed; runs once before the sweep. Each pair of conformers in each mol + is compared element-wise; mismatches abort the benchmark so we never + publish a number for a broken kernel. + """ + subset = mols[:num_check] + if not subset: + return + print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})") + gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False) + torch.cuda.synchronize() + max_abs_diff = 0.0 + for mol_idx, mol in enumerate(subset): + rdkit_mol = Chem.Mol(mol.ToBinary()) + rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False) + gpu_rms = gpu_results[mol_idx].numpy().tolist() + if len(gpu_rms) != len(rdkit_rms): + raise RuntimeError( + f"validation: mol {mol_idx} pair count mismatch " + f"(gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" + ) + for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)): + diff = abs(float(gpu_val) - float(rdkit_val)) + if diff > tol: + raise RuntimeError( + f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} " + f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})" + ) + if diff > max_abs_diff: + max_abs_diff = diff + print(f" OK (max abs diff {max_abs_diff:.5f})") + + +def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]: + """Return copies of ``mols`` keeping only the first ``target`` conformers each. + + The shared base set is prepared once at the maximum conformer count; this + helper produces the per-sweep-point view without re-embedding so every + sweep row sees the exact same molecule selection and base geometries. + """ + out: list[Chem.Mol] = [] + for mol in mols: + copy_mol = Chem.Mol(mol, True) # quickCopy: keeps graph, drops conformers + confs = list(mol.GetConformers())[:target] + for conf in confs: + copy_mol.AddConformer(Chem.Conformer(conf), assignId=True) + out.append(copy_mol) + return out + +def run( + smiles_path: str, + num_mols: int, + confs_per_mol_list: list[int], + seed: int, + prep_workers: int, + rdkit_max_seconds: float, + validate_count: int, + validate_tol: float, + no_rdkit: bool, + no_nvmolkit: bool, + output: str | None, +) -> None: + if no_rdkit and no_nvmolkit: + raise ValueError("cannot disable both RDKit and nvMolKit") + if any(count < 2 for count in confs_per_mol_list): + raise ValueError("every --confs_per_mol value must be >= 2") + + if not no_nvmolkit: + print(f"GPU: {torch.cuda.get_device_name(0)} CUDA: {torch.version.cuda}") + print(f"Loading SMILES from {smiles_path} (target {num_mols} mols)") + raw = load_smiles(smiles_path, max_count=num_mols, sanitize=True, seed=seed) + + max_confs = max(confs_per_mol_list) + print(f"Preparing {len(raw)} mols x {max_confs} conformers (perturb-from-1-embed)") + base_mols = prepare_mols(raw, confs_per_mol=max_confs, seed=seed, num_workers=prep_workers) + if len(base_mols) > num_mols: + base_mols = base_mols[:num_mols] + if not base_mols: + raise RuntimeError("no molecules survived embedding") + + avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols) + print(f" {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol") + if validate_count > 0 and not no_rdkit and not no_nvmolkit: + validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol) + elif validate_count > 0: + print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)") + + print(f"\nSweeping confs_per_mol: {confs_per_mol_list}") + + rows: list[dict[str, float | int | str]] = [] + for target_confs in sorted(confs_per_mol_list): + mols = _slice_to_confs(base_mols, target_confs) + actual_confs = [mol.GetNumConformers() for mol in mols] + total_pairs = sum(count * (count - 1) // 2 for count in actual_confs) print( - f"{actual_confs:>8} {n_pairs:>10} {cpu_time * 1000:>10.2f} " - f"{gpu_time * 1000:>10.2f} {speedup:>7.1f}x " - f"{'OK' if match_ok else f'FAIL ({max_diff:.4f})':>6}" + f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, " + f"{total_pairs} RMSD pairs ===" ) + row: dict[str, float | int | str] = { + "num_mols": len(mols), + "confs_per_mol": target_confs, + "total_pairs": total_pairs, + "avg_heavy_atoms": avg_atoms, + } + + rdkit_mols_per_s: float | None = None + rdkit_pairs_per_s: float | None = None + if not no_rdkit: + payloads = [mol.ToBinary() for mol in mols] + cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap" + print(f" RDKit CPU (single-threaded, {cap_label}):") + bench_rdkit_batch(payloads, rdkit_max_seconds) # warmup + samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)] + samples.sort(key=lambda pair: pair[0] / max(pair[1], 1)) + rdkit_time_s, rdkit_done = samples[len(samples) // 2] + pair_count_done = sum( + count * (count - 1) // 2 for count in actual_confs[:rdkit_done] + ) + rdkit_mols_per_s = rdkit_done / rdkit_time_s + rdkit_pairs_per_s = pair_count_done / rdkit_time_s + truncated = rdkit_done < len(mols) + suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else "" + print( + f" median wall: {rdkit_time_s * 1000:.1f} ms over {rdkit_done} mols " + f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}" + ) + row["rdkit_median_s"] = rdkit_time_s + row["rdkit_mols_processed"] = rdkit_done + row["rdkit_truncated"] = int(truncated) + row["rdkit_mols_per_s"] = rdkit_mols_per_s + row["rdkit_pairs_per_s"] = rdkit_pairs_per_s + + gpu_pairs_per_s: float | None = None + if not no_nvmolkit: + print(" nvMolKit GPU (batched):") + result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True) + gpu_time_s = result.median_s + gpu_pairs_per_s = total_pairs / gpu_time_s + print( + f" median wall: {gpu_time_s * 1000:.1f} ms " + f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)" + ) + row["gpu_median_s"] = gpu_time_s + row["gpu_mols_per_s"] = len(mols) / gpu_time_s + row["gpu_pairs_per_s"] = gpu_pairs_per_s + + if rdkit_pairs_per_s is not None and gpu_pairs_per_s is not None: + row["speedup"] = gpu_pairs_per_s / rdkit_pairs_per_s + print(f" GPU speedup vs single-threaded RDKit (pairs/s): {row['speedup']:.1f}x") + + rows.append(row) + + if output and rows: + out_path = Path(output) + out_path.parent.mkdir(parents=True, exist_ok=True) + fieldnames: list[str] = [] + for row in rows: + for key in row: + if key not in fieldnames: + fieldnames.append(key) + with out_path.open("w", newline="") as fh: + writer = csv.DictWriter(fh, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + print(f"\nWrote {out_path}") + def main(): - parser = argparse.ArgumentParser(description="Benchmark GPU vs CPU conformer RMSD matrix") - parser.add_argument( - "--smiles", - nargs="+", - default=[ - "CC(=O)Oc1ccccc1C(=O)O", # aspirin (13 HA) - "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1", # celecoxib (24 HA) - "C=CC(=O)Nc1cc(OC)c(Nc2nccc(-c3cn(C)c4ccccc34)n2)cc1N(C)CCN(C)C", # osimertinib (33 HA) - "CC(C)CC1NC(=O)C(CC(=O)O)NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(CO)NC(=O)C(Cc2c[nH]c3ccccc23)NC1=O", # cyclic pentapeptide (~48 HA) - ], - help="SMILES strings to benchmark", - ) - parser.add_argument( - "--num-confs", - nargs="+", - type=int, - default=[10, 50, 100, 200, 500], - help="Number of conformers to generate", - ) + parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine") + parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file") + parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample") + parser.add_argument("--confs_per_mol", type=int, nargs="+", default=[10, 25, 50, 100, 200], + help="Conformers-per-molecule sweep points (each >=2)") + parser.add_argument("--prep_workers", type=int, default=0, + help="Workers for the embed-and-perturb prep step (0 = half of CPUs)") + parser.add_argument("--rdkit_max_seconds", type=float, default=0.0, + help="Per-iteration wall-clock cap on the RDKit comparison " + "(0 = no cap). When exceeded, throughput is reported " + "over the molecules actually processed.") + parser.add_argument("--validate_count", type=int, default=8, + help="Number of mols to compare GPU vs RDKit before timing " + "(0 disables; requires both backends enabled)") + parser.add_argument("--validate_tol", type=float, default=0.05, + help="Absolute tolerance (Angstroms) for per-pair RMSD diff") + parser.add_argument("--no_validate", action="store_true", + help="Skip the GPU-vs-RDKit correctness check") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") + parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark") + parser.add_argument("--no-nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark") args = parser.parse_args() - device_name = torch.cuda.get_device_name(0) - print(f"GPU: {device_name}") - print(f"CUDA: {torch.version.cuda}") + if args.no_rdkit and args.no_nvmolkit: + parser.error("cannot pass both --no-rdkit and --no-nvmolkit") - for smiles in args.smiles: - run_benchmark(smiles, args.num_confs) + run( + smiles_path=args.smiles, + num_mols=args.num_mols, + confs_per_mol_list=args.confs_per_mol, + seed=args.seed, + prep_workers=args.prep_workers, + rdkit_max_seconds=args.rdkit_max_seconds, + validate_count=0 if args.no_validate else args.validate_count, + validate_tol=args.validate_tol, + no_rdkit=args.no_rdkit, + no_nvmolkit=args.no_nvmolkit, + output=args.output, + ) print("\nDone.") From 390f9ff6abd696e43340af4f0b03fe5fdfd1caaa Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Fri, 29 May 2026 17:35:53 -0400 Subject: [PATCH 2/8] conformer_rmsd: trim docstrings to drop cross-bench rationale and narration --- benchmarks/conformer_rmsd_bench.py | 52 +++++++++++------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index 83e0f63a..983d00b8 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -15,19 +15,14 @@ """Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine. -Mirrors the sampling strategy used by the FF / TFD benches: load a slice of -Enamine REAL, embed one ETKDGv3 base conformer per molecule in parallel, and -derive the remaining conformers by jittering the base structure. The two -implementations being compared then process the *batch* of molecules: - -* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call covers every mol. -* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` called in a serial loop, - matching the head-to-head convention of the other single-GPU benches - (butina, cross_similarity, tfd) which all compare against single-threaded - RDKit. - -Throughput is reported in molecules/s and pair-RMSDs/s so configurations with -different conformer counts can be compared apples-to-apples. +Loads a slice of Enamine REAL, embeds one ETKDGv3 base conformer per molecule +and jitters it to produce the remaining conformers, then compares two +implementations over the batch of molecules: + +* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call. +* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` in a serial loop. + +Throughput is reported in molecules/s and pair-RMSDs/s. """ import argparse @@ -53,9 +48,7 @@ def prepare_mols( ) -> list[Chem.Mol]: """Embed one base conformer per mol, then perturb to ``confs_per_mol``. - Thin wrapper over :func:`bench_utils.embed_and_jitter` that enforces - confs_per_mol >= 2 (RMSD needs at least one pair) and adds explicit - hydrogens during embedding so ETKDG sees a chemically reasonable graph. + Requires ``confs_per_mol >= 2`` since RMSD needs at least one conformer pair. """ if confs_per_mol < 2: raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}") @@ -72,13 +65,11 @@ def prepare_mols( def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]: - """One RDKit timing iteration: serial loop, returns ``(elapsed_s, n_done)``. + """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``. - When ``max_seconds > 0`` the loop breaks after the deadline is exceeded; - callers compute throughput as ``n_done / elapsed_s`` so a truncated run - is still extrapolated to a fair pairs/s figure. ``GetConformerRMSMatrix`` - mutates conformer coordinates in-place during Kabsch alignment, so each - call gets a fresh deserialization. + Stops once ``max_seconds`` is exceeded (``0`` means no cap). Each iteration + deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer + coordinates in-place during Kabsch alignment. """ deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None start = time.perf_counter() @@ -100,11 +91,9 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None: def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: - """Diff GPU RMSD matrices against RDKit on the first ``num_check`` mols. + """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols. - Untimed; runs once before the sweep. Each pair of conformers in each mol - is compared element-wise; mismatches abort the benchmark so we never - publish a number for a broken kernel. + Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``. """ subset = mols[:num_check] if not subset: @@ -135,12 +124,7 @@ def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]: - """Return copies of ``mols`` keeping only the first ``target`` conformers each. - - The shared base set is prepared once at the maximum conformer count; this - helper produces the per-sweep-point view without re-embedding so every - sweep row sees the exact same molecule selection and base geometries. - """ + """Return copies of ``mols`` keeping only the first ``target`` conformers each.""" out: list[Chem.Mol] = [] for mol in mols: copy_mol = Chem.Mol(mol, True) # quickCopy: keeps graph, drops conformers @@ -263,8 +247,8 @@ def run( for key in row: if key not in fieldnames: fieldnames.append(key) - with out_path.open("w", newline="") as fh: - writer = csv.DictWriter(fh, fieldnames=fieldnames) + with out_path.open("w", newline="") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) print(f"\nWrote {out_path}") From a80a9d1a19b3d7d6e35d9d93a042c873a5846f91 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 08:39:10 -0400 Subject: [PATCH 3/8] conformer_rmsd: reflow long argparse calls onto single lines Cosmetic only; matches the formatting style used in adjacent benches. --- benchmarks/conformer_rmsd_bench.py | 55 +++++++++++++++++------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index 983d00b8..b6f113c2 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -108,8 +108,7 @@ def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: gpu_rms = gpu_results[mol_idx].numpy().tolist() if len(gpu_rms) != len(rdkit_rms): raise RuntimeError( - f"validation: mol {mol_idx} pair count mismatch " - f"(gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" + f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" ) for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)): diff = abs(float(gpu_val) - float(rdkit_val)) @@ -180,10 +179,7 @@ def run( mols = _slice_to_confs(base_mols, target_confs) actual_confs = [mol.GetNumConformers() for mol in mols] total_pairs = sum(count * (count - 1) // 2 for count in actual_confs) - print( - f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, " - f"{total_pairs} RMSD pairs ===" - ) + print(f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, {total_pairs} RMSD pairs ===") row: dict[str, float | int | str] = { "num_mols": len(mols), @@ -202,9 +198,7 @@ def run( samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)] samples.sort(key=lambda pair: pair[0] / max(pair[1], 1)) rdkit_time_s, rdkit_done = samples[len(samples) // 2] - pair_count_done = sum( - count * (count - 1) // 2 for count in actual_confs[:rdkit_done] - ) + pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done]) rdkit_mols_per_s = rdkit_done / rdkit_time_s rdkit_pairs_per_s = pair_count_done / rdkit_time_s truncated = rdkit_done < len(mols) @@ -258,21 +252,34 @@ def main(): parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine") parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file") parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample") - parser.add_argument("--confs_per_mol", type=int, nargs="+", default=[10, 25, 50, 100, 200], - help="Conformers-per-molecule sweep points (each >=2)") - parser.add_argument("--prep_workers", type=int, default=0, - help="Workers for the embed-and-perturb prep step (0 = half of CPUs)") - parser.add_argument("--rdkit_max_seconds", type=float, default=0.0, - help="Per-iteration wall-clock cap on the RDKit comparison " - "(0 = no cap). When exceeded, throughput is reported " - "over the molecules actually processed.") - parser.add_argument("--validate_count", type=int, default=8, - help="Number of mols to compare GPU vs RDKit before timing " - "(0 disables; requires both backends enabled)") - parser.add_argument("--validate_tol", type=float, default=0.05, - help="Absolute tolerance (Angstroms) for per-pair RMSD diff") - parser.add_argument("--no_validate", action="store_true", - help="Skip the GPU-vs-RDKit correctness check") + parser.add_argument( + "--confs_per_mol", + type=int, + nargs="+", + default=[10, 25, 50, 100, 200], + help="Conformers-per-molecule sweep points (each >=2)", + ) + parser.add_argument( + "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)" + ) + parser.add_argument( + "--rdkit_max_seconds", + type=float, + default=0.0, + help="Per-iteration wall-clock cap on the RDKit comparison " + "(0 = no cap). When exceeded, throughput is reported " + "over the molecules actually processed.", + ) + parser.add_argument( + "--validate_count", + type=int, + default=8, + help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)", + ) + parser.add_argument( + "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff" + ) + parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark") From 5ba61fc1610f1f2ba7985ccc5ee2429c78471a77 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 08:44:33 -0400 Subject: [PATCH 4/8] Fix some conf benchmark things --- benchmarks/conformer_rmsd_bench.py | 31 ++++++++++-------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index b6f113c2..324e462c 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -15,14 +15,9 @@ """Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine. -Loads a slice of Enamine REAL, embeds one ETKDGv3 base conformer per molecule -and jitters it to produce the remaining conformers, then compares two -implementations over the batch of molecules: +Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's +CPU GetConformerRMSMatrix across varying conformer counts. -* nvMolKit GPU: a single ``GetConformerRMSMatrixBatch`` call. -* RDKit CPU: ``AllChem.GetConformerRMSMatrix`` in a serial loop. - -Throughput is reported in molecules/s and pair-RMSDs/s. """ import argparse @@ -32,7 +27,7 @@ from pathlib import Path import torch -from bench_utils import embed_and_jitter, load_smiles +from bench_utils import Deadline, add_rdkit_max_seconds_arg, embed_and_jitter, load_smiles from benchmark_timing import time_it from rdkit import Chem from rdkit.Chem import AllChem @@ -71,22 +66,20 @@ def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer coordinates in-place during Kabsch alignment. """ - deadline = time.perf_counter() + max_seconds if max_seconds > 0 else None + deadline = Deadline(max_seconds) start = time.perf_counter() n_done = 0 for mol_bytes in payloads: mol = Chem.Mol(mol_bytes) AllChem.GetConformerRMSMatrix(mol, prealigned=False) n_done += 1 - if deadline is not None and time.perf_counter() >= deadline: + if deadline.expired(): break return time.perf_counter() - start, n_done def bench_gpu_batch(mols: list[Chem.Mol]) -> None: results = GetConformerRMSMatrixBatch(mols, prealigned=False) - for result in results: - result.torch() torch.cuda.synchronize() @@ -249,8 +242,8 @@ def run( def main(): - parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark vs Enamine") - parser.add_argument("--smiles", required=True, help="Path to Enamine (or any) SMILES/cxsmiles file") + parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark") + parser.add_argument("--smiles", required=True, help="Path to smiles file") parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample") parser.add_argument( "--confs_per_mol", @@ -262,13 +255,9 @@ def main(): parser.add_argument( "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)" ) - parser.add_argument( - "--rdkit_max_seconds", - type=float, - default=0.0, - help="Per-iteration wall-clock cap on the RDKit comparison " - "(0 = no cap). When exceeded, throughput is reported " - "over the molecules actually processed.", + add_rdkit_max_seconds_arg( + parser, + extra_help="The cap applies per timing iteration and truncates at a molecule boundary.", ) parser.add_argument( "--validate_count", From 786d47ecfb716777158705dfb6319b980be1c6d4 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 08:59:47 -0400 Subject: [PATCH 5/8] Remove some stuff --- benchmarks/conformer_rmsd_bench.py | 71 ++++++------------------------ 1 file changed, 14 insertions(+), 57 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index 324e462c..c734eb96 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -23,6 +23,7 @@ import argparse import csv import multiprocessing as mp +import statistics import time from pathlib import Path @@ -62,15 +63,14 @@ def prepare_mols( def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]: """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``. - Stops once ``max_seconds`` is exceeded (``0`` means no cap). Each iteration - deserializes a fresh mol because ``GetConformerRMSMatrix`` mutates conformer - coordinates in-place during Kabsch alignment. + Stops once ``max_seconds`` is exceeded (``0`` means no cap). A fresh mol is + built per call because ``GetConformerRMSMatrix`` aligns conformers in place. """ + mols = [Chem.Mol(mol_bytes) for mol_bytes in payloads] deadline = Deadline(max_seconds) start = time.perf_counter() n_done = 0 - for mol_bytes in payloads: - mol = Chem.Mol(mol_bytes) + for mol in mols: AllChem.GetConformerRMSMatrix(mol, prealigned=False) n_done += 1 if deadline.expired(): @@ -83,38 +83,6 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None: torch.cuda.synchronize() -def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: - """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols. - - Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``. - """ - subset = mols[:num_check] - if not subset: - return - print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})") - gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False) - torch.cuda.synchronize() - max_abs_diff = 0.0 - for mol_idx, mol in enumerate(subset): - rdkit_mol = Chem.Mol(mol.ToBinary()) - rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False) - gpu_rms = gpu_results[mol_idx].numpy().tolist() - if len(gpu_rms) != len(rdkit_rms): - raise RuntimeError( - f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" - ) - for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)): - diff = abs(float(gpu_val) - float(rdkit_val)) - if diff > tol: - raise RuntimeError( - f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} " - f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})" - ) - if diff > max_abs_diff: - max_abs_diff = diff - print(f" OK (max abs diff {max_abs_diff:.5f})") - - def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]: """Return copies of ``mols`` keeping only the first ``target`` conformers each.""" out: list[Chem.Mol] = [] @@ -134,8 +102,6 @@ def run( seed: int, prep_workers: int, rdkit_max_seconds: float, - validate_count: int, - validate_tol: float, no_rdkit: bool, no_nvmolkit: bool, output: str | None, @@ -160,10 +126,6 @@ def run( avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols) print(f" {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol") - if validate_count > 0 and not no_rdkit and not no_nvmolkit: - validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol) - elif validate_count > 0: - print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)") print(f"\nSweeping confs_per_mol: {confs_per_mol_list}") @@ -187,20 +149,25 @@ def run( payloads = [mol.ToBinary() for mol in mols] cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap" print(f" RDKit CPU (single-threaded, {cap_label}):") + # TODO: replace this hand-rolled warmup/sample/median loop with time_it once + # time_it can consume a Deadline and truncate a run mid-workload. + # https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/186 bench_rdkit_batch(payloads, rdkit_max_seconds) # warmup samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)] samples.sort(key=lambda pair: pair[0] / max(pair[1], 1)) rdkit_time_s, rdkit_done = samples[len(samples) // 2] + rdkit_std_s = statistics.stdev([elapsed for elapsed, _ in samples]) if len(samples) > 1 else 0.0 pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done]) rdkit_mols_per_s = rdkit_done / rdkit_time_s rdkit_pairs_per_s = pair_count_done / rdkit_time_s truncated = rdkit_done < len(mols) suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else "" print( - f" median wall: {rdkit_time_s * 1000:.1f} ms over {rdkit_done} mols " + f" median wall: {rdkit_time_s * 1000:.1f} +/- {rdkit_std_s * 1000:.1f} ms over {rdkit_done} mols " f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}" ) row["rdkit_median_s"] = rdkit_time_s + row["rdkit_std_s"] = rdkit_std_s row["rdkit_mols_processed"] = rdkit_done row["rdkit_truncated"] = int(truncated) row["rdkit_mols_per_s"] = rdkit_mols_per_s @@ -211,12 +178,14 @@ def run( print(" nvMolKit GPU (batched):") result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True) gpu_time_s = result.median_s + gpu_std_s = result.std_ms / 1000.0 gpu_pairs_per_s = total_pairs / gpu_time_s print( - f" median wall: {gpu_time_s * 1000:.1f} ms " + f" median wall: {gpu_time_s * 1000:.1f} +/- {gpu_std_s * 1000:.1f} ms " f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)" ) row["gpu_median_s"] = gpu_time_s + row["gpu_std_s"] = gpu_std_s row["gpu_mols_per_s"] = len(mols) / gpu_time_s row["gpu_pairs_per_s"] = gpu_pairs_per_s @@ -259,16 +228,6 @@ def main(): parser, extra_help="The cap applies per timing iteration and truncates at a molecule boundary.", ) - parser.add_argument( - "--validate_count", - type=int, - default=8, - help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)", - ) - parser.add_argument( - "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff" - ) - parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark") @@ -285,8 +244,6 @@ def main(): seed=args.seed, prep_workers=args.prep_workers, rdkit_max_seconds=args.rdkit_max_seconds, - validate_count=0 if args.no_validate else args.validate_count, - validate_tol=args.validate_tol, no_rdkit=args.no_rdkit, no_nvmolkit=args.no_nvmolkit, output=args.output, From 34a284b1cdad48965ae8dd6997f9fc1af70d5c08 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 09:03:25 -0400 Subject: [PATCH 6/8] Fix minor things --- benchmarks/conformer_rmsd_bench.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index c734eb96..92e0e91f 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -13,11 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD on Enamine. +"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD. Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's CPU GetConformerRMSMatrix across varying conformer counts. - """ import argparse @@ -42,12 +41,7 @@ def prepare_mols( seed: int, num_workers: int, ) -> list[Chem.Mol]: - """Embed one base conformer per mol, then perturb to ``confs_per_mol``. - - Requires ``confs_per_mol >= 2`` since RMSD needs at least one conformer pair. - """ - if confs_per_mol < 2: - raise ValueError(f"confs_per_mol must be >= 2, got {confs_per_mol}") + """Embed one base conformer per mol, then perturb to ``confs_per_mol``.""" workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2) return embed_and_jitter( raw_mols, @@ -230,12 +224,12 @@ def main(): ) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") - parser.add_argument("--no-rdkit", action="store_true", help="Skip RDKit CPU benchmark") - parser.add_argument("--no-nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark") + parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark") + parser.add_argument("--no_nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark") args = parser.parse_args() if args.no_rdkit and args.no_nvmolkit: - parser.error("cannot pass both --no-rdkit and --no-nvmolkit") + parser.error("cannot pass both --no_rdkit and --no_nvmolkit") run( smiles_path=args.smiles, From 3700a3c60352a7e6e7dd3b287509247bb9f283b3 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 09:07:21 -0400 Subject: [PATCH 7/8] Add validation back in --- benchmarks/conformer_rmsd_bench.py | 50 ++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index 92e0e91f..ee8d49ff 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -77,6 +77,38 @@ def bench_gpu_batch(mols: list[Chem.Mol]) -> None: torch.cuda.synchronize() +def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: + """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols. + + Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``. + """ + subset = mols[:num_check] + if not subset: + return + print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})") + gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False) + torch.cuda.synchronize() + max_abs_diff = 0.0 + for mol_idx, mol in enumerate(subset): + rdkit_mol = Chem.Mol(mol.ToBinary()) + rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False) + gpu_rms = gpu_results[mol_idx].numpy().tolist() + if len(gpu_rms) != len(rdkit_rms): + raise RuntimeError( + f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" + ) + for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)): + diff = abs(float(gpu_val) - float(rdkit_val)) + if diff > tol: + raise RuntimeError( + f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} " + f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})" + ) + if diff > max_abs_diff: + max_abs_diff = diff + print(f" OK (max abs diff {max_abs_diff:.5f})") + + def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]: """Return copies of ``mols`` keeping only the first ``target`` conformers each.""" out: list[Chem.Mol] = [] @@ -96,6 +128,8 @@ def run( seed: int, prep_workers: int, rdkit_max_seconds: float, + validate_count: int, + validate_tol: float, no_rdkit: bool, no_nvmolkit: bool, output: str | None, @@ -120,6 +154,10 @@ def run( avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols) print(f" {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol") + if validate_count > 0 and not no_rdkit and not no_nvmolkit: + validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol) + elif validate_count > 0: + print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)") print(f"\nSweeping confs_per_mol: {confs_per_mol_list}") @@ -222,6 +260,16 @@ def main(): parser, extra_help="The cap applies per timing iteration and truncates at a molecule boundary.", ) + parser.add_argument( + "--validate_count", + type=int, + default=8, + help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)", + ) + parser.add_argument( + "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff" + ) + parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark") @@ -238,6 +286,8 @@ def main(): seed=args.seed, prep_workers=args.prep_workers, rdkit_max_seconds=args.rdkit_max_seconds, + validate_count=0 if args.no_validate else args.validate_count, + validate_tol=args.validate_tol, no_rdkit=args.no_rdkit, no_nvmolkit=args.no_nvmolkit, output=args.output, From 3293e1f96054bde89ed0e54cc87da9eaced95332 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Tue, 2 Jun 2026 12:23:08 -0400 Subject: [PATCH 8/8] Fix std bug --- benchmarks/conformer_rmsd_bench.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index ee8d49ff..e9123a0b 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -188,7 +188,8 @@ def run( samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)] samples.sort(key=lambda pair: pair[0] / max(pair[1], 1)) rdkit_time_s, rdkit_done = samples[len(samples) // 2] - rdkit_std_s = statistics.stdev([elapsed for elapsed, _ in samples]) if len(samples) > 1 else 0.0 + per_mol_times = [elapsed / max(done, 1) for elapsed, done in samples] + rdkit_std_s = statistics.stdev(per_mol_times) * rdkit_done if len(samples) > 1 else 0.0 pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done]) rdkit_mols_per_s = rdkit_done / rdkit_time_s rdkit_pairs_per_s = pair_count_done / rdkit_time_s