diff --git a/benchmarks/conformer_rmsd_bench.py b/benchmarks/conformer_rmsd_bench.py index dbd688c5..e9123a0b 100644 --- a/benchmarks/conformer_rmsd_bench.py +++ b/benchmarks/conformer_rmsd_bench.py @@ -13,163 +13,286 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Benchmark: GPU vs CPU conformer RMSD matrix computation. +"""Benchmark: GPU vs single-threaded CPU pairwise conformer RMSD. Measures speedup of nvMolKit's GPU GetConformerRMSMatrix over RDKit's -CPU GetConformerRMSMatrix across varying conformer counts and molecule sizes. - -Usage: - python conformer_rmsd_bench.py - python conformer_rmsd_bench.py --num-confs 50 100 200 500 - python conformer_rmsd_bench.py --smiles "CCCCCCCCCC" --num-confs 500 +CPU GetConformerRMSMatrix across varying conformer counts. """ import argparse -import copy +import csv +import multiprocessing as mp +import statistics +import time +from pathlib import Path -import numpy as np import torch -from bench_utils import perturb_conformer +from bench_utils import Deadline, add_rdkit_max_seconds_arg, embed_and_jitter, load_smiles from benchmark_timing import time_it from rdkit import Chem -from rdkit.Chem import AllChem, rdDistGeom +from rdkit.Chem import AllChem -from nvmolkit.conformerRmsd import GetConformerRMSMatrix +from nvmolkit.conformerRmsd import GetConformerRMSMatrixBatch -def _numpy_kabsch_rmsd(p, q): - """Independent Kabsch RMSD using numpy SVD.""" - p_c = p - p.mean(axis=0) - q_c = q - q.mean(axis=0) - H = p_c.T @ q_c - S = np.linalg.svd(H, compute_uv=False) - d = np.sign(np.linalg.det(H)) - S[-1] *= d if d != 0.0 else 1.0 - Sp = np.sum(p_c**2) - Sq = np.sum(q_c**2) - return np.sqrt(max((Sp + Sq - 2.0 * np.sum(S)) / len(p), 0.0)) +def prepare_mols( + raw_mols: list[Chem.Mol], + confs_per_mol: int, + seed: int, + num_workers: int, +) -> list[Chem.Mol]: + """Embed one base conformer per mol, then perturb to ``confs_per_mol``.""" + workers = num_workers if num_workers > 0 else max(1, mp.cpu_count() // 2) + return embed_and_jitter( + raw_mols, + confs_per_mol=confs_per_mol, + seed=seed, + num_workers=workers, + add_hs=True, + min_atoms=2, + desc=f"Embed + perturb ({confs_per_mol} confs)", + ) -def benchmark_cpu(mol, n_warmup=1, n_iter=5): - """Benchmark RDKit CPU GetConformerRMSMatrix. +def bench_rdkit_batch(payloads: list[bytes], max_seconds: float) -> tuple[float, int]: + """One RDKit timing iteration over ``payloads``; returns ``(elapsed_s, n_done)``. - Deep-copies the molecule before each call because GetConformerRMSMatrix - modifies conformer coordinates in-place during alignment; reusing the - same molecule would measure already-aligned conformers and understate - the true CPU cost. + Stops once ``max_seconds`` is exceeded (``0`` means no cap). A fresh mol is + built per call because ``GetConformerRMSMatrix`` aligns conformers in place. """ - result = time_it( - lambda: AllChem.GetConformerRMSMatrix(copy.deepcopy(mol), prealigned=False), runs=n_iter, warmups=n_warmup - ) - return result.median_s + mols = [Chem.Mol(mol_bytes) for mol_bytes in payloads] + deadline = Deadline(max_seconds) + start = time.perf_counter() + n_done = 0 + for mol in mols: + AllChem.GetConformerRMSMatrix(mol, prealigned=False) + n_done += 1 + if deadline.expired(): + break + return time.perf_counter() - start, n_done -def benchmark_gpu(mol, n_warmup=2, n_iter=10): - """Benchmark nvMolKit GPU GetConformerRMSMatrix.""" - result = time_it( - lambda: GetConformerRMSMatrix(mol, prealigned=False), runs=n_iter, warmups=n_warmup, gpu_sync=True - ) - return result.median_s - - -def run_benchmark(smiles, num_confs_list, seed=42): - """Run CPU vs GPU benchmark for a molecule at various conformer counts.""" - mol_base = Chem.AddHs(Chem.MolFromSmiles(smiles)) - no_h_base = Chem.RemoveHs(Chem.AddHs(Chem.MolFromSmiles(smiles))) - n_atoms = no_h_base.GetNumAtoms() - - print(f"\nMolecule: {smiles} ({n_atoms} heavy atoms)") - print(f"{'Confs':>8} {'Pairs':>10} {'CPU (ms)':>10} {'GPU (ms)':>10} {'Speedup':>8} {'Match':>6}") - print("-" * 70) - - for num_confs in num_confs_list: - mol = Chem.RWMol(mol_base) - mol.RemoveAllConformers() - params = rdDistGeom.ETKDGv3() - params.randomSeed = seed - params.useRandomCoords = True - if rdDistGeom.EmbedMolecule(mol, params=params) < 0: - print(f"{num_confs:>8} {'skipped (embedding failed)':>50}") - continue - if num_confs < 2: - print(f"{num_confs:>8} {'skipped (need >= 2 confs for RMSD)':>50}") - continue - base_conf_id = mol.GetConformer().GetId() - for conf_idx in range(1, num_confs): - new_conf = Chem.Conformer(mol.GetConformer(base_conf_id)) - perturb_conformer(new_conf, seed=seed + conf_idx) - mol.AddConformer(new_conf, assignId=True) - perturb_conformer(mol.GetConformer(base_conf_id), seed=seed) - actual_confs = mol.GetNumConformers() - - no_h = Chem.RemoveHs(mol) - n_pairs = actual_confs * (actual_confs - 1) // 2 - - # CPU benchmark - cpu_time = benchmark_cpu(no_h) - - # GPU benchmark - gpu_time = benchmark_gpu(no_h) - - # Correctness check against numpy Kabsch SVD (sample up to 500 pairs) - gpu_result = GetConformerRMSMatrix(no_h, prealigned=False) - torch.cuda.synchronize() - gpu_rms = gpu_result.numpy().tolist() - - confs = no_h.GetConformers() - coords = [np.array(c.GetPositions()) for c in confs] - max_diff = 0.0 - count = 0 - max_checks = min(500, n_pairs) - for i in range(len(confs)): - for j in range(i): - idx = i * (i - 1) // 2 + j - ref = _numpy_kabsch_rmsd(coords[i], coords[j]) - max_diff = max(max_diff, abs(gpu_rms[idx] - ref)) - count += 1 - if count >= max_checks: - break - if count >= max_checks: - break - match_ok = max_diff < 0.05 - - speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") - - print( - f"{actual_confs:>8} {n_pairs:>10} {cpu_time * 1000:>10.2f} " - f"{gpu_time * 1000:>10.2f} {speedup:>7.1f}x " - f"{'OK' if match_ok else f'FAIL ({max_diff:.4f})':>6}" - ) +def bench_gpu_batch(mols: list[Chem.Mol]) -> None: + results = GetConformerRMSMatrixBatch(mols, prealigned=False) + torch.cuda.synchronize() + + +def validate(mols: list[Chem.Mol], num_check: int, tol: float) -> None: + """Compare GPU RMSD matrices against RDKit on the first ``num_check`` mols. + + Raises ``RuntimeError`` on the first pair whose absolute diff exceeds ``tol``. + """ + subset = mols[:num_check] + if not subset: + return + print(f"\nValidation: comparing GPU vs RDKit on {len(subset)} mols (tol={tol})") + gpu_results = GetConformerRMSMatrixBatch(subset, prealigned=False) + torch.cuda.synchronize() + max_abs_diff = 0.0 + for mol_idx, mol in enumerate(subset): + rdkit_mol = Chem.Mol(mol.ToBinary()) + rdkit_rms = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False) + gpu_rms = gpu_results[mol_idx].numpy().tolist() + if len(gpu_rms) != len(rdkit_rms): + raise RuntimeError( + f"validation: mol {mol_idx} pair count mismatch (gpu={len(gpu_rms)}, rdkit={len(rdkit_rms)})" + ) + for pair_idx, (gpu_val, rdkit_val) in enumerate(zip(gpu_rms, rdkit_rms)): + diff = abs(float(gpu_val) - float(rdkit_val)) + if diff > tol: + raise RuntimeError( + f"validation: mol {mol_idx} pair {pair_idx} diff {diff:.4f} > {tol} " + f"(gpu={gpu_val:.4f}, rdkit={rdkit_val:.4f})" + ) + if diff > max_abs_diff: + max_abs_diff = diff + print(f" OK (max abs diff {max_abs_diff:.5f})") + + +def _slice_to_confs(mols: list[Chem.Mol], target: int) -> list[Chem.Mol]: + """Return copies of ``mols`` keeping only the first ``target`` conformers each.""" + out: list[Chem.Mol] = [] + for mol in mols: + copy_mol = Chem.Mol(mol, True) # quickCopy: keeps graph, drops conformers + confs = list(mol.GetConformers())[:target] + for conf in confs: + copy_mol.AddConformer(Chem.Conformer(conf), assignId=True) + out.append(copy_mol) + return out + + +def run( + smiles_path: str, + num_mols: int, + confs_per_mol_list: list[int], + seed: int, + prep_workers: int, + rdkit_max_seconds: float, + validate_count: int, + validate_tol: float, + no_rdkit: bool, + no_nvmolkit: bool, + output: str | None, +) -> None: + if no_rdkit and no_nvmolkit: + raise ValueError("cannot disable both RDKit and nvMolKit") + if any(count < 2 for count in confs_per_mol_list): + raise ValueError("every --confs_per_mol value must be >= 2") + + if not no_nvmolkit: + print(f"GPU: {torch.cuda.get_device_name(0)} CUDA: {torch.version.cuda}") + print(f"Loading SMILES from {smiles_path} (target {num_mols} mols)") + raw = load_smiles(smiles_path, max_count=num_mols, sanitize=True, seed=seed) + + max_confs = max(confs_per_mol_list) + print(f"Preparing {len(raw)} mols x {max_confs} conformers (perturb-from-1-embed)") + base_mols = prepare_mols(raw, confs_per_mol=max_confs, seed=seed, num_workers=prep_workers) + if len(base_mols) > num_mols: + base_mols = base_mols[:num_mols] + if not base_mols: + raise RuntimeError("no molecules survived embedding") + + avg_atoms = sum(mol.GetNumAtoms() for mol in base_mols) / len(base_mols) + print(f" {len(base_mols)} mols, ~{avg_atoms:.1f} heavy atoms/mol") + if validate_count > 0 and not no_rdkit and not no_nvmolkit: + validate(_slice_to_confs(base_mols, max_confs), validate_count, validate_tol) + elif validate_count > 0: + print("\nValidation skipped (requires both --rdkit and --nvmolkit enabled)") + + print(f"\nSweeping confs_per_mol: {confs_per_mol_list}") + + rows: list[dict[str, float | int | str]] = [] + for target_confs in sorted(confs_per_mol_list): + mols = _slice_to_confs(base_mols, target_confs) + actual_confs = [mol.GetNumConformers() for mol in mols] + total_pairs = sum(count * (count - 1) // 2 for count in actual_confs) + print(f"\n=== confs_per_mol={target_confs}: {len(mols)} mols, {total_pairs} RMSD pairs ===") + + row: dict[str, float | int | str] = { + "num_mols": len(mols), + "confs_per_mol": target_confs, + "total_pairs": total_pairs, + "avg_heavy_atoms": avg_atoms, + } + + rdkit_mols_per_s: float | None = None + rdkit_pairs_per_s: float | None = None + if not no_rdkit: + payloads = [mol.ToBinary() for mol in mols] + cap_label = f"cap={rdkit_max_seconds:.0f}s" if rdkit_max_seconds > 0 else "no cap" + print(f" RDKit CPU (single-threaded, {cap_label}):") + # TODO: replace this hand-rolled warmup/sample/median loop with time_it once + # time_it can consume a Deadline and truncate a run mid-workload. + # https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/186 + bench_rdkit_batch(payloads, rdkit_max_seconds) # warmup + samples = [bench_rdkit_batch(payloads, rdkit_max_seconds) for _ in range(3)] + samples.sort(key=lambda pair: pair[0] / max(pair[1], 1)) + rdkit_time_s, rdkit_done = samples[len(samples) // 2] + per_mol_times = [elapsed / max(done, 1) for elapsed, done in samples] + rdkit_std_s = statistics.stdev(per_mol_times) * rdkit_done if len(samples) > 1 else 0.0 + pair_count_done = sum(count * (count - 1) // 2 for count in actual_confs[:rdkit_done]) + rdkit_mols_per_s = rdkit_done / rdkit_time_s + rdkit_pairs_per_s = pair_count_done / rdkit_time_s + truncated = rdkit_done < len(mols) + suffix = f" [truncated at {rdkit_done}/{len(mols)} mols]" if truncated else "" + print( + f" median wall: {rdkit_time_s * 1000:.1f} +/- {rdkit_std_s * 1000:.1f} ms over {rdkit_done} mols " + f"({rdkit_mols_per_s:.1f} mols/s, {rdkit_pairs_per_s:.0f} pairs/s){suffix}" + ) + row["rdkit_median_s"] = rdkit_time_s + row["rdkit_std_s"] = rdkit_std_s + row["rdkit_mols_processed"] = rdkit_done + row["rdkit_truncated"] = int(truncated) + row["rdkit_mols_per_s"] = rdkit_mols_per_s + row["rdkit_pairs_per_s"] = rdkit_pairs_per_s + + gpu_pairs_per_s: float | None = None + if not no_nvmolkit: + print(" nvMolKit GPU (batched):") + result = time_it(lambda: bench_gpu_batch(mols), runs=5, warmups=2, gpu_sync=True) + gpu_time_s = result.median_s + gpu_std_s = result.std_ms / 1000.0 + gpu_pairs_per_s = total_pairs / gpu_time_s + print( + f" median wall: {gpu_time_s * 1000:.1f} +/- {gpu_std_s * 1000:.1f} ms " + f"({len(mols) / gpu_time_s:.1f} mols/s, {gpu_pairs_per_s:.0f} pairs/s)" + ) + row["gpu_median_s"] = gpu_time_s + row["gpu_std_s"] = gpu_std_s + row["gpu_mols_per_s"] = len(mols) / gpu_time_s + row["gpu_pairs_per_s"] = gpu_pairs_per_s + + if rdkit_pairs_per_s is not None and gpu_pairs_per_s is not None: + row["speedup"] = gpu_pairs_per_s / rdkit_pairs_per_s + print(f" GPU speedup vs single-threaded RDKit (pairs/s): {row['speedup']:.1f}x") + + rows.append(row) + + if output and rows: + out_path = Path(output) + out_path.parent.mkdir(parents=True, exist_ok=True) + fieldnames: list[str] = [] + for row in rows: + for key in row: + if key not in fieldnames: + fieldnames.append(key) + with out_path.open("w", newline="") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + print(f"\nWrote {out_path}") def main(): - parser = argparse.ArgumentParser(description="Benchmark GPU vs CPU conformer RMSD matrix") + parser = argparse.ArgumentParser(description="Conformer RMSD batch benchmark") + parser.add_argument("--smiles", required=True, help="Path to smiles file") + parser.add_argument("--num_mols", type=int, default=2000, help="Number of molecules to sample") parser.add_argument( - "--smiles", + "--confs_per_mol", + type=int, nargs="+", - default=[ - "CC(=O)Oc1ccccc1C(=O)O", # aspirin (13 HA) - "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1", # celecoxib (24 HA) - "C=CC(=O)Nc1cc(OC)c(Nc2nccc(-c3cn(C)c4ccccc34)n2)cc1N(C)CCN(C)C", # osimertinib (33 HA) - "CC(C)CC1NC(=O)C(CC(=O)O)NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(CO)NC(=O)C(Cc2c[nH]c3ccccc23)NC1=O", # cyclic pentapeptide (~48 HA) - ], - help="SMILES strings to benchmark", + default=[10, 25, 50, 100, 200], + help="Conformers-per-molecule sweep points (each >=2)", ) parser.add_argument( - "--num-confs", - nargs="+", + "--prep_workers", type=int, default=0, help="Workers for the embed-and-perturb prep step (0 = half of CPUs)" + ) + add_rdkit_max_seconds_arg( + parser, + extra_help="The cap applies per timing iteration and truncates at a molecule boundary.", + ) + parser.add_argument( + "--validate_count", type=int, - default=[10, 50, 100, 200, 500], - help="Number of conformers to generate", + default=8, + help="Number of mols to compare GPU vs RDKit before timing (0 disables; requires both backends enabled)", ) + parser.add_argument( + "--validate_tol", type=float, default=0.05, help="Absolute tolerance (Angstroms) for per-pair RMSD diff" + ) + parser.add_argument("--no_validate", action="store_true", help="Skip the GPU-vs-RDKit correctness check") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--output", type=str, default=None, help="Optional CSV output path") + parser.add_argument("--no_rdkit", action="store_true", help="Skip RDKit CPU benchmark") + parser.add_argument("--no_nvmolkit", action="store_true", help="Skip nvMolKit GPU benchmark") args = parser.parse_args() - device_name = torch.cuda.get_device_name(0) - print(f"GPU: {device_name}") - print(f"CUDA: {torch.version.cuda}") + if args.no_rdkit and args.no_nvmolkit: + parser.error("cannot pass both --no_rdkit and --no_nvmolkit") - for smiles in args.smiles: - run_benchmark(smiles, args.num_confs) + run( + smiles_path=args.smiles, + num_mols=args.num_mols, + confs_per_mol_list=args.confs_per_mol, + seed=args.seed, + prep_workers=args.prep_workers, + rdkit_max_seconds=args.rdkit_max_seconds, + validate_count=0 if args.no_validate else args.validate_count, + validate_tol=args.validate_tol, + no_rdkit=args.no_rdkit, + no_nvmolkit=args.no_nvmolkit, + output=args.output, + ) print("\nDone.")