From 4d847d24b0454a6d3071f7a7f96adf7dbaebabf2 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Fri, 29 May 2026 14:16:17 -0400 Subject: [PATCH 1/3] butina: opt-in rdkit_lowmem; split rdkit columns; allow synthetic-fill - Flip `--no-rdkit-lowmem` to opt-in `--rdkit-lowmem`. The lowmem backend builds its distance matrix in a pure-Python O(n^2) loop that doesn't finish in reasonable time at sizes >= 40k, so default off. - Drop `--include-tanimoto-matrix` and always report both rdkit variants (`rdkit_cluster_only_*` and `rdkit_with_tanimoto_*`) so a single run characterises both the inner-loop and the with-distance-build timings. - Allow rdkit_cluster_only / fused / nvmolkit to run on synthetic-fill fingerprints when the input has fewer mols than `size`; with_tanimoto and lowmem rows are skipped in that case (they need real fingerprints). - Edge-case cutoff changes from 1.1 to 1.0. --- benchmarks/butina_clustering_bench.py | 141 ++++++++++++++++---------- 1 file changed, 85 insertions(+), 56 deletions(-) diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py index f1c5c4e..bccd617 100644 --- a/benchmarks/butina_clustering_bench.py +++ b/benchmarks/butina_clustering_bench.py @@ -150,11 +150,14 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): parser.add_argument("--no-rdkit", action="store_true", help="Disable RDKit benchmarks") parser.add_argument("--no-fused", action="store_true", help="Disable fused Butina benchmarks") parser.add_argument("--no-nvmolkit", action="store_true", help="Disable nvMolKit Butina benchmarks") - parser.add_argument("--no-rdkit-lowmem", action="store_true", help="Disable RDKit low-memory benchmarks") parser.add_argument( - "--include-tanimoto-matrix", + "--rdkit-lowmem", action="store_true", - help="Include tanimoto matrix calculation in butina timing (for fair comparison with fused_butina)", + help=( + "Enable the RDKit low-memory backend. Off by default because its " + "distance-matrix builder is a pure-Python O(n^2) loop that does " + "not finish in reasonable wall time at sizes >= 40k." + ), ) parser.add_argument( "--config", @@ -170,7 +173,6 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): ) args = parser.parse_args() - include_tanimoto = args.include_tanimoto_matrix n_runs = args.runs disabled = set() if args.no_rdkit: @@ -179,7 +181,7 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): disabled.add("fused") if args.no_nvmolkit: disabled.add("nvmolkit") - if args.no_rdkit_lowmem: + if not args.rdkit_lowmem: disabled.add("rdkit_lowmem") if args.config: @@ -211,35 +213,25 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): mols = load_smiles(args.input_smiles_file, max_count=max_size + 100, sanitize=True, seed=args.seed) - if include_tanimoto and len(mols) < max_size: - print( - f"Error: --include-tanimoto-matrix requires at least {max_size} molecules, " - f"but only {len(mols)} were loaded from input", - file=sys.stderr, - ) - sys.exit(1) - fps = get_fingerprints(mols) + # rdkit_with_tanimoto and rdkit_lowmem both require real RDKit fingerprints + # for every molecule up to the requested size. The precomputed-matrix rdkit + # path (rdkit_cluster_only) can still run on synthetic distances when the + # input has fewer mols than `size`, but the with_tanimoto / lowmem rows + # for that size are skipped in that case. max_rdkit_fps_size = max( - (e["size"] for e in run_plan if "rdkit_lowmem" in e["run"] or ("rdkit" in e["run"] and include_tanimoto)), + (e["size"] for e in run_plan if "rdkit" in e["run"] or "rdkit_lowmem" in e["run"]), default=0, ) if max_rdkit_fps_size > 0: - if len(mols) < max_rdkit_fps_size: - print( - f"Error: rdkit benchmarks with fingerprints require at least {max_rdkit_fps_size} " - f"molecules, but only {len(mols)} were loaded from input", - file=sys.stderr, - ) - sys.exit(1) rdkit_fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=1024) rdkit_fps = [rdkit_fpgen.GetFingerprint(mol) for mol in mols] else: rdkit_fps = None output_path = args.output - cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.1] + cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.0] default_nl_sizes = [8, 16, 32, 64, 128] results = [] @@ -254,34 +246,58 @@ def save_results(): runs = entry["run"] max_nl_sizes = entry.get("neighborlist_sizes", default_nl_sizes) - need_dist = "nvmolkit" in runs or ("rdkit" in runs and not include_tanimoto) - if include_tanimoto: - fps_mat = fps[:size].contiguous() - if need_dist: - dist_mat = 1.0 - crossTanimotoSimilarity(fps_mat).torch() + # rdkit_with_tanimoto and rdkit_lowmem need real fingerprints up to + # `size`. If the input has fewer mols, those rows are skipped for + # this size (cluster_only and fused still run on synthetic data). + have_real_fps_for_size = rdkit_fps is not None and len(rdkit_fps) >= size + + need_real_fps_mat = "fused" in runs or "nvmolkit" in runs + if need_real_fps_mat and len(mols) >= size: + fps_mat_real = fps[:size].contiguous() + else: + fps_mat_real = None + if need_real_fps_mat and fps_mat_real is None: + fps_mat_synth = resize_and_fill_fingerprints(fps, size) else: - fps_mat = resize_and_fill_fingerprints(fps, size) - if need_dist: - real_size = min(size, len(mols)) + fps_mat_synth = None + fps_mat = fps_mat_real if fps_mat_real is not None else fps_mat_synth + + need_dist = "nvmolkit" in runs or "rdkit" in runs + if need_dist: + real_size = min(size, len(mols)) + if "nvmolkit" in runs or "fused" in runs: base_dists = 1.0 - crossTanimotoSimilarity(fps[:real_size]).torch() + else: + rdkit_dist = np.empty((real_size, real_size), dtype=np.float64) + for i in range(real_size): + rdkit_dist[i] = BulkTanimotoSimilarity(rdkit_fps[i], rdkit_fps[:real_size]) + np.subtract(1.0, rdkit_dist, out=rdkit_dist) + base_dists = torch.from_numpy(rdkit_dist) + if real_size >= size: + dist_mat = base_dists.contiguous() + else: dist_mat = resize_and_fill(base_dists, size) del base_dists for cutoff in cutoffs: # Don't run large sizes for edge cases. - if cutoff in (1e-10, 1.1) and size > 20000: + if cutoff in (1e-10, 1.0) and size > 20000: continue - rdkit_time, rdk_std = float("nan"), float("nan") + rdkit_co_time, rdkit_co_std = float("nan"), float("nan") + rdkit_wt_time, rdkit_wt_std = float("nan"), float("nan") if "rdkit" in runs: - if include_tanimoto: - rdkit_time, rdk_std = bench_rdkit_with_tanimoto(rdkit_fps[:size], cutoff, runs=n_runs) - else: - dist_mat_numpy = dist_mat.cpu().numpy() - rdkit_time, rdk_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs) + print(f"Running rdkit_cluster_only size {size} cutoff {cutoff}") + dist_mat_numpy = dist_mat.cpu().numpy() + rdkit_co_time, rdkit_co_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs) + if have_real_fps_for_size: + print(f"Running rdkit_with_tanimoto size {size} cutoff {cutoff}") + rdkit_wt_time, rdkit_wt_std = bench_rdkit_with_tanimoto( + rdkit_fps[:size], cutoff, runs=n_runs + ) rdkit_lm_time, rdkit_lm_std = float("nan"), float("nan") - if "rdkit_lowmem" in runs: + if "rdkit_lowmem" in runs and have_real_fps_for_size: print(f"Running rdkit_lowmem size {size} cutoff {cutoff}") rdkit_lm_time, rdkit_lm_std = bench_rdkit_lowmem(rdkit_fps[:size], cutoff, runs=n_runs) @@ -297,20 +313,25 @@ def save_results(): if "nvmolkit" in runs: for max_nl in max_nl_sizes: - print(f"Running nvmolkit size {size} cutoff {cutoff} max_nl {max_nl}") - if include_tanimoto: - nvmol_result = time_it( - lambda: bench_nvmol_with_tanimoto(fps_mat, cutoff, max_nl), - gpu_sync=True, - runs=n_runs, + print(f"Running nvmolkit_cluster_only size {size} cutoff {cutoff} max_nl {max_nl}") + nvmol_co_result = time_it( + lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl), + gpu_sync=True, + runs=n_runs, + ) + nvmol_co_time, nvmol_co_std = nvmol_co_result.mean_ms, nvmol_co_result.std_ms + + nvmol_wt_time, nvmol_wt_std = float("nan"), float("nan") + if fps_mat_real is not None: + print( + f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}" ) - else: - nvmol_result = time_it( - lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl), + nvmol_wt_result = time_it( + lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl), gpu_sync=True, runs=n_runs, ) - nvmol_time, nvmol_std = nvmol_result.mean_ms, nvmol_result.std_ms + nvmol_wt_time, nvmol_wt_std = nvmol_wt_result.mean_ms, nvmol_wt_result.std_ms nvmol_res = butina_nvmol(dist_mat, cutoff, neighborlist_max_size=max_nl).torch() torch.cuda.synchronize() @@ -325,12 +346,16 @@ def save_results(): "size": size, "cutoff": cutoff, "max_neighborlist_size": max_nl, - "rdkit_time_ms": rdkit_time, - "rdkit_std_ms": rdk_std, + "rdkit_cluster_only_time_ms": rdkit_co_time, + "rdkit_cluster_only_std_ms": rdkit_co_std, + "rdkit_with_tanimoto_time_ms": rdkit_wt_time, + "rdkit_with_tanimoto_std_ms": rdkit_wt_std, "rdkit_lowmem_time_ms": rdkit_lm_time, "rdkit_lowmem_std_ms": rdkit_lm_std, - "nvmol_time_ms": nvmol_time, - "nvmol_std_ms": nvmol_std, + "nvmolkit_cluster_only_time_ms": nvmol_co_time, + "nvmolkit_cluster_only_std_ms": nvmol_co_std, + "nvmolkit_with_tanimoto_time_ms": nvmol_wt_time, + "nvmolkit_with_tanimoto_std_ms": nvmol_wt_std, "fused_butina_time_ms": fused_time, "fused_butina_std_ms": fused_std, } @@ -341,12 +366,16 @@ def save_results(): "size": size, "cutoff": cutoff, "max_neighborlist_size": float("nan"), - "rdkit_time_ms": rdkit_time, - "rdkit_std_ms": rdk_std, + "rdkit_cluster_only_time_ms": rdkit_co_time, + "rdkit_cluster_only_std_ms": rdkit_co_std, + "rdkit_with_tanimoto_time_ms": rdkit_wt_time, + "rdkit_with_tanimoto_std_ms": rdkit_wt_std, "rdkit_lowmem_time_ms": rdkit_lm_time, "rdkit_lowmem_std_ms": rdkit_lm_std, - "nvmol_time_ms": float("nan"), - "nvmol_std_ms": float("nan"), + "nvmolkit_cluster_only_time_ms": float("nan"), + "nvmolkit_cluster_only_std_ms": float("nan"), + "nvmolkit_with_tanimoto_time_ms": float("nan"), + "nvmolkit_with_tanimoto_std_ms": float("nan"), "fused_butina_time_ms": fused_time, "fused_butina_std_ms": fused_std, } From a02a45fc80a887693378ae01ab952f8556a56921 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Fri, 29 May 2026 17:35:53 -0400 Subject: [PATCH 2/3] butina: use full names for rdkit/nvmolkit timing vars; trim duplicate comments --- benchmarks/butina_clustering_bench.py | 56 +++++++++++++-------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py index bccd617..9363efe 100644 --- a/benchmarks/butina_clustering_bench.py +++ b/benchmarks/butina_clustering_bench.py @@ -215,11 +215,8 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): fps = get_fingerprints(mols) - # rdkit_with_tanimoto and rdkit_lowmem both require real RDKit fingerprints - # for every molecule up to the requested size. The precomputed-matrix rdkit - # path (rdkit_cluster_only) can still run on synthetic distances when the - # input has fewer mols than `size`, but the with_tanimoto / lowmem rows - # for that size are skipped in that case. + # All three rdkit paths (cluster_only, with_tanimoto, lowmem) need real + # RDKit fingerprints, so build them once if any rdkit row is planned. max_rdkit_fps_size = max( (e["size"] for e in run_plan if "rdkit" in e["run"] or "rdkit_lowmem" in e["run"]), default=0, @@ -246,9 +243,8 @@ def save_results(): runs = entry["run"] max_nl_sizes = entry.get("neighborlist_sizes", default_nl_sizes) - # rdkit_with_tanimoto and rdkit_lowmem need real fingerprints up to - # `size`. If the input has fewer mols, those rows are skipped for - # this size (cluster_only and fused still run on synthetic data). + # with_tanimoto and lowmem need real fingerprints for every mol up to + # `size`; when the input is smaller, those rows are skipped here. have_real_fps_for_size = rdkit_fps is not None and len(rdkit_fps) >= size need_real_fps_mat = "fused" in runs or "nvmolkit" in runs @@ -284,15 +280,15 @@ def save_results(): if cutoff in (1e-10, 1.0) and size > 20000: continue - rdkit_co_time, rdkit_co_std = float("nan"), float("nan") - rdkit_wt_time, rdkit_wt_std = float("nan"), float("nan") + rdkit_cluster_only_time, rdkit_cluster_only_std = float("nan"), float("nan") + rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = float("nan"), float("nan") if "rdkit" in runs: print(f"Running rdkit_cluster_only size {size} cutoff {cutoff}") dist_mat_numpy = dist_mat.cpu().numpy() - rdkit_co_time, rdkit_co_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs) + rdkit_cluster_only_time, rdkit_cluster_only_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs) if have_real_fps_for_size: print(f"Running rdkit_with_tanimoto size {size} cutoff {cutoff}") - rdkit_wt_time, rdkit_wt_std = bench_rdkit_with_tanimoto( + rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = bench_rdkit_with_tanimoto( rdkit_fps[:size], cutoff, runs=n_runs ) @@ -314,24 +310,26 @@ def save_results(): if "nvmolkit" in runs: for max_nl in max_nl_sizes: print(f"Running nvmolkit_cluster_only size {size} cutoff {cutoff} max_nl {max_nl}") - nvmol_co_result = time_it( + nvmolkit_cluster_only_result = time_it( lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl), gpu_sync=True, runs=n_runs, ) - nvmol_co_time, nvmol_co_std = nvmol_co_result.mean_ms, nvmol_co_result.std_ms + nvmolkit_cluster_only_time = nvmolkit_cluster_only_result.mean_ms + nvmolkit_cluster_only_std = nvmolkit_cluster_only_result.std_ms - nvmol_wt_time, nvmol_wt_std = float("nan"), float("nan") + nvmolkit_with_tanimoto_time, nvmolkit_with_tanimoto_std = float("nan"), float("nan") if fps_mat_real is not None: print( f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}" ) - nvmol_wt_result = time_it( + nvmolkit_with_tanimoto_result = time_it( lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl), gpu_sync=True, runs=n_runs, ) - nvmol_wt_time, nvmol_wt_std = nvmol_wt_result.mean_ms, nvmol_wt_result.std_ms + nvmolkit_with_tanimoto_time = nvmolkit_with_tanimoto_result.mean_ms + nvmolkit_with_tanimoto_std = nvmolkit_with_tanimoto_result.std_ms nvmol_res = butina_nvmol(dist_mat, cutoff, neighborlist_max_size=max_nl).torch() torch.cuda.synchronize() @@ -346,16 +344,16 @@ def save_results(): "size": size, "cutoff": cutoff, "max_neighborlist_size": max_nl, - "rdkit_cluster_only_time_ms": rdkit_co_time, - "rdkit_cluster_only_std_ms": rdkit_co_std, - "rdkit_with_tanimoto_time_ms": rdkit_wt_time, - "rdkit_with_tanimoto_std_ms": rdkit_wt_std, + "rdkit_cluster_only_time_ms": rdkit_cluster_only_time, + "rdkit_cluster_only_std_ms": rdkit_cluster_only_std, + "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time, + "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std, "rdkit_lowmem_time_ms": rdkit_lm_time, "rdkit_lowmem_std_ms": rdkit_lm_std, - "nvmolkit_cluster_only_time_ms": nvmol_co_time, - "nvmolkit_cluster_only_std_ms": nvmol_co_std, - "nvmolkit_with_tanimoto_time_ms": nvmol_wt_time, - "nvmolkit_with_tanimoto_std_ms": nvmol_wt_std, + "nvmolkit_cluster_only_time_ms": nvmolkit_cluster_only_time, + "nvmolkit_cluster_only_std_ms": nvmolkit_cluster_only_std, + "nvmolkit_with_tanimoto_time_ms": nvmolkit_with_tanimoto_time, + "nvmolkit_with_tanimoto_std_ms": nvmolkit_with_tanimoto_std, "fused_butina_time_ms": fused_time, "fused_butina_std_ms": fused_std, } @@ -366,10 +364,10 @@ def save_results(): "size": size, "cutoff": cutoff, "max_neighborlist_size": float("nan"), - "rdkit_cluster_only_time_ms": rdkit_co_time, - "rdkit_cluster_only_std_ms": rdkit_co_std, - "rdkit_with_tanimoto_time_ms": rdkit_wt_time, - "rdkit_with_tanimoto_std_ms": rdkit_wt_std, + "rdkit_cluster_only_time_ms": rdkit_cluster_only_time, + "rdkit_cluster_only_std_ms": rdkit_cluster_only_std, + "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time, + "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std, "rdkit_lowmem_time_ms": rdkit_lm_time, "rdkit_lowmem_std_ms": rdkit_lm_std, "nvmolkit_cluster_only_time_ms": float("nan"), From 258f7b08722ee93363ed3b8956d6940a5bc1dacd Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Mon, 1 Jun 2026 08:39:10 -0400 Subject: [PATCH 3/3] butina: collapse nvmolkit_with_tanimoto print onto one line --- benchmarks/butina_clustering_bench.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py index 9363efe..dde235b 100644 --- a/benchmarks/butina_clustering_bench.py +++ b/benchmarks/butina_clustering_bench.py @@ -320,9 +320,7 @@ def save_results(): nvmolkit_with_tanimoto_time, nvmolkit_with_tanimoto_std = float("nan"), float("nan") if fps_mat_real is not None: - print( - f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}" - ) + print(f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}") nvmolkit_with_tanimoto_result = time_it( lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl), gpu_sync=True,