From 4d847d24b0454a6d3071f7a7f96adf7dbaebabf2 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Fri, 29 May 2026 14:16:17 -0400
Subject: [PATCH 1/3] butina: opt-in rdkit_lowmem; split rdkit columns; allow
 synthetic-fill

- Flip `--no-rdkit-lowmem` to opt-in `--rdkit-lowmem`. The lowmem backend
  builds its distance matrix in a pure-Python O(n^2) loop that doesn't
  finish in reasonable time at sizes >= 40k, so default off.
- Drop `--include-tanimoto-matrix` and always report both rdkit variants
  (`rdkit_cluster_only_*` and `rdkit_with_tanimoto_*`) so a single run
  characterises both the inner-loop and the with-distance-build timings.
- Allow rdkit_cluster_only / fused / nvmolkit to run on synthetic-fill
  fingerprints when the input has fewer mols than `size`; with_tanimoto
  and lowmem rows are skipped in that case (they need real fingerprints).
- Edge-case cutoff changes from 1.1 to 1.0.
---
 benchmarks/butina_clustering_bench.py | 141 ++++++++++++++++----------
 1 file changed, 85 insertions(+), 56 deletions(-)

diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py
index f1c5c4e..bccd617 100644
--- a/benchmarks/butina_clustering_bench.py
+++ b/benchmarks/butina_clustering_bench.py
@@ -150,11 +150,14 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
     parser.add_argument("--no-rdkit", action="store_true", help="Disable RDKit benchmarks")
     parser.add_argument("--no-fused", action="store_true", help="Disable fused Butina benchmarks")
     parser.add_argument("--no-nvmolkit", action="store_true", help="Disable nvMolKit Butina benchmarks")
-    parser.add_argument("--no-rdkit-lowmem", action="store_true", help="Disable RDKit low-memory benchmarks")
     parser.add_argument(
-        "--include-tanimoto-matrix",
+        "--rdkit-lowmem",
         action="store_true",
-        help="Include tanimoto matrix calculation in butina timing (for fair comparison with fused_butina)",
+        help=(
+            "Enable the RDKit low-memory backend. Off by default because its "
+            "distance-matrix builder is a pure-Python O(n^2) loop that does "
+            "not finish in reasonable wall time at sizes >= 40k."
+        ),
     )
     parser.add_argument(
         "--config",
@@ -170,7 +173,6 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
     )
     args = parser.parse_args()
 
-    include_tanimoto = args.include_tanimoto_matrix
     n_runs = args.runs
     disabled = set()
     if args.no_rdkit:
@@ -179,7 +181,7 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
         disabled.add("fused")
     if args.no_nvmolkit:
         disabled.add("nvmolkit")
-    if args.no_rdkit_lowmem:
+    if not args.rdkit_lowmem:
         disabled.add("rdkit_lowmem")
 
     if args.config:
@@ -211,35 +213,25 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
 
     mols = load_smiles(args.input_smiles_file, max_count=max_size + 100, sanitize=True, seed=args.seed)
 
-    if include_tanimoto and len(mols) < max_size:
-        print(
-            f"Error: --include-tanimoto-matrix requires at least {max_size} molecules, "
-            f"but only {len(mols)} were loaded from input",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-
     fps = get_fingerprints(mols)
 
+    # rdkit_with_tanimoto and rdkit_lowmem both require real RDKit fingerprints
+    # for every molecule up to the requested size. The precomputed-matrix rdkit
+    # path (rdkit_cluster_only) can still run on synthetic distances when the
+    # input has fewer mols than `size`, but the with_tanimoto / lowmem rows
+    # for that size are skipped in that case.
     max_rdkit_fps_size = max(
-        (e["size"] for e in run_plan if "rdkit_lowmem" in e["run"] or ("rdkit" in e["run"] and include_tanimoto)),
+        (e["size"] for e in run_plan if "rdkit" in e["run"] or "rdkit_lowmem" in e["run"]),
         default=0,
     )
     if max_rdkit_fps_size > 0:
-        if len(mols) < max_rdkit_fps_size:
-            print(
-                f"Error: rdkit benchmarks with fingerprints require at least {max_rdkit_fps_size} "
-                f"molecules, but only {len(mols)} were loaded from input",
-                file=sys.stderr,
-            )
-            sys.exit(1)
         rdkit_fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=1024)
         rdkit_fps = [rdkit_fpgen.GetFingerprint(mol) for mol in mols]
     else:
         rdkit_fps = None
 
     output_path = args.output
-    cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.1]
+    cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.0]
     default_nl_sizes = [8, 16, 32, 64, 128]
     results = []
 
@@ -254,34 +246,58 @@ def save_results():
             runs = entry["run"]
             max_nl_sizes = entry.get("neighborlist_sizes", default_nl_sizes)
 
-            need_dist = "nvmolkit" in runs or ("rdkit" in runs and not include_tanimoto)
-            if include_tanimoto:
-                fps_mat = fps[:size].contiguous()
-                if need_dist:
-                    dist_mat = 1.0 - crossTanimotoSimilarity(fps_mat).torch()
+            # rdkit_with_tanimoto and rdkit_lowmem need real fingerprints up to
+            # `size`. If the input has fewer mols, those rows are skipped for
+            # this size (cluster_only and fused still run on synthetic data).
+            have_real_fps_for_size = rdkit_fps is not None and len(rdkit_fps) >= size
+
+            need_real_fps_mat = "fused" in runs or "nvmolkit" in runs
+            if need_real_fps_mat and len(mols) >= size:
+                fps_mat_real = fps[:size].contiguous()
+            else:
+                fps_mat_real = None
+            if need_real_fps_mat and fps_mat_real is None:
+                fps_mat_synth = resize_and_fill_fingerprints(fps, size)
             else:
-                fps_mat = resize_and_fill_fingerprints(fps, size)
-                if need_dist:
-                    real_size = min(size, len(mols))
+                fps_mat_synth = None
+            fps_mat = fps_mat_real if fps_mat_real is not None else fps_mat_synth
+
+            need_dist = "nvmolkit" in runs or "rdkit" in runs
+            if need_dist:
+                real_size = min(size, len(mols))
+                if "nvmolkit" in runs or "fused" in runs:
                     base_dists = 1.0 - crossTanimotoSimilarity(fps[:real_size]).torch()
+                else:
+                    rdkit_dist = np.empty((real_size, real_size), dtype=np.float64)
+                    for i in range(real_size):
+                        rdkit_dist[i] = BulkTanimotoSimilarity(rdkit_fps[i], rdkit_fps[:real_size])
+                    np.subtract(1.0, rdkit_dist, out=rdkit_dist)
+                    base_dists = torch.from_numpy(rdkit_dist)
+                if real_size >= size:
+                    dist_mat = base_dists.contiguous()
+                else:
                     dist_mat = resize_and_fill(base_dists, size)
                     del base_dists
 
             for cutoff in cutoffs:
                 # Don't run large sizes for edge cases.
-                if cutoff in (1e-10, 1.1) and size > 20000:
+                if cutoff in (1e-10, 1.0) and size > 20000:
                     continue
 
-                rdkit_time, rdk_std = float("nan"), float("nan")
+                rdkit_co_time, rdkit_co_std = float("nan"), float("nan")
+                rdkit_wt_time, rdkit_wt_std = float("nan"), float("nan")
                 if "rdkit" in runs:
-                    if include_tanimoto:
-                        rdkit_time, rdk_std = bench_rdkit_with_tanimoto(rdkit_fps[:size], cutoff, runs=n_runs)
-                    else:
-                        dist_mat_numpy = dist_mat.cpu().numpy()
-                        rdkit_time, rdk_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
+                    print(f"Running rdkit_cluster_only size {size} cutoff {cutoff}")
+                    dist_mat_numpy = dist_mat.cpu().numpy()
+                    rdkit_co_time, rdkit_co_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
+                    if have_real_fps_for_size:
+                        print(f"Running rdkit_with_tanimoto size {size} cutoff {cutoff}")
+                        rdkit_wt_time, rdkit_wt_std = bench_rdkit_with_tanimoto(
+                            rdkit_fps[:size], cutoff, runs=n_runs
+                        )
 
                 rdkit_lm_time, rdkit_lm_std = float("nan"), float("nan")
-                if "rdkit_lowmem" in runs:
+                if "rdkit_lowmem" in runs and have_real_fps_for_size:
                     print(f"Running rdkit_lowmem size {size} cutoff {cutoff}")
                     rdkit_lm_time, rdkit_lm_std = bench_rdkit_lowmem(rdkit_fps[:size], cutoff, runs=n_runs)
 
@@ -297,20 +313,25 @@ def save_results():
 
                 if "nvmolkit" in runs:
                     for max_nl in max_nl_sizes:
-                        print(f"Running nvmolkit size {size} cutoff {cutoff} max_nl {max_nl}")
-                        if include_tanimoto:
-                            nvmol_result = time_it(
-                                lambda: bench_nvmol_with_tanimoto(fps_mat, cutoff, max_nl),
-                                gpu_sync=True,
-                                runs=n_runs,
+                        print(f"Running nvmolkit_cluster_only size {size} cutoff {cutoff} max_nl {max_nl}")
+                        nvmol_co_result = time_it(
+                            lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl),
+                            gpu_sync=True,
+                            runs=n_runs,
+                        )
+                        nvmol_co_time, nvmol_co_std = nvmol_co_result.mean_ms, nvmol_co_result.std_ms
+
+                        nvmol_wt_time, nvmol_wt_std = float("nan"), float("nan")
+                        if fps_mat_real is not None:
+                            print(
+                                f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}"
                             )
-                        else:
-                            nvmol_result = time_it(
-                                lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl),
+                            nvmol_wt_result = time_it(
+                                lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl),
                                 gpu_sync=True,
                                 runs=n_runs,
                             )
-                        nvmol_time, nvmol_std = nvmol_result.mean_ms, nvmol_result.std_ms
+                            nvmol_wt_time, nvmol_wt_std = nvmol_wt_result.mean_ms, nvmol_wt_result.std_ms
 
                         nvmol_res = butina_nvmol(dist_mat, cutoff, neighborlist_max_size=max_nl).torch()
                         torch.cuda.synchronize()
@@ -325,12 +346,16 @@ def save_results():
                                 "size": size,
                                 "cutoff": cutoff,
                                 "max_neighborlist_size": max_nl,
-                                "rdkit_time_ms": rdkit_time,
-                                "rdkit_std_ms": rdk_std,
+                                "rdkit_cluster_only_time_ms": rdkit_co_time,
+                                "rdkit_cluster_only_std_ms": rdkit_co_std,
+                                "rdkit_with_tanimoto_time_ms": rdkit_wt_time,
+                                "rdkit_with_tanimoto_std_ms": rdkit_wt_std,
                                 "rdkit_lowmem_time_ms": rdkit_lm_time,
                                 "rdkit_lowmem_std_ms": rdkit_lm_std,
-                                "nvmol_time_ms": nvmol_time,
-                                "nvmol_std_ms": nvmol_std,
+                                "nvmolkit_cluster_only_time_ms": nvmol_co_time,
+                                "nvmolkit_cluster_only_std_ms": nvmol_co_std,
+                                "nvmolkit_with_tanimoto_time_ms": nvmol_wt_time,
+                                "nvmolkit_with_tanimoto_std_ms": nvmol_wt_std,
                                 "fused_butina_time_ms": fused_time,
                                 "fused_butina_std_ms": fused_std,
                             }
@@ -341,12 +366,16 @@ def save_results():
                             "size": size,
                             "cutoff": cutoff,
                             "max_neighborlist_size": float("nan"),
-                            "rdkit_time_ms": rdkit_time,
-                            "rdkit_std_ms": rdk_std,
+                            "rdkit_cluster_only_time_ms": rdkit_co_time,
+                            "rdkit_cluster_only_std_ms": rdkit_co_std,
+                            "rdkit_with_tanimoto_time_ms": rdkit_wt_time,
+                            "rdkit_with_tanimoto_std_ms": rdkit_wt_std,
                             "rdkit_lowmem_time_ms": rdkit_lm_time,
                             "rdkit_lowmem_std_ms": rdkit_lm_std,
-                            "nvmol_time_ms": float("nan"),
-                            "nvmol_std_ms": float("nan"),
+                            "nvmolkit_cluster_only_time_ms": float("nan"),
+                            "nvmolkit_cluster_only_std_ms": float("nan"),
+                            "nvmolkit_with_tanimoto_time_ms": float("nan"),
+                            "nvmolkit_with_tanimoto_std_ms": float("nan"),
                             "fused_butina_time_ms": fused_time,
                             "fused_butina_std_ms": fused_std,
                         }

From a02a45fc80a887693378ae01ab952f8556a56921 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Fri, 29 May 2026 17:35:53 -0400
Subject: [PATCH 2/3] butina: use full names for rdkit/nvmolkit timing vars;
 trim duplicate comments

---
 benchmarks/butina_clustering_bench.py | 56 +++++++++++++--------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py
index bccd617..9363efe 100644
--- a/benchmarks/butina_clustering_bench.py
+++ b/benchmarks/butina_clustering_bench.py
@@ -215,11 +215,8 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
 
     fps = get_fingerprints(mols)
 
-    # rdkit_with_tanimoto and rdkit_lowmem both require real RDKit fingerprints
-    # for every molecule up to the requested size. The precomputed-matrix rdkit
-    # path (rdkit_cluster_only) can still run on synthetic distances when the
-    # input has fewer mols than `size`, but the with_tanimoto / lowmem rows
-    # for that size are skipped in that case.
+    # All three rdkit paths (cluster_only, with_tanimoto, lowmem) need real
+    # RDKit fingerprints, so build them once if any rdkit row is planned.
     max_rdkit_fps_size = max(
         (e["size"] for e in run_plan if "rdkit" in e["run"] or "rdkit_lowmem" in e["run"]),
         default=0,
@@ -246,9 +243,8 @@ def save_results():
             runs = entry["run"]
             max_nl_sizes = entry.get("neighborlist_sizes", default_nl_sizes)
 
-            # rdkit_with_tanimoto and rdkit_lowmem need real fingerprints up to
-            # `size`. If the input has fewer mols, those rows are skipped for
-            # this size (cluster_only and fused still run on synthetic data).
+            # with_tanimoto and lowmem need real fingerprints for every mol up to
+            # `size`; when the input is smaller, those rows are skipped here.
             have_real_fps_for_size = rdkit_fps is not None and len(rdkit_fps) >= size
 
             need_real_fps_mat = "fused" in runs or "nvmolkit" in runs
@@ -284,15 +280,15 @@ def save_results():
                 if cutoff in (1e-10, 1.0) and size > 20000:
                     continue
 
-                rdkit_co_time, rdkit_co_std = float("nan"), float("nan")
-                rdkit_wt_time, rdkit_wt_std = float("nan"), float("nan")
+                rdkit_cluster_only_time, rdkit_cluster_only_std = float("nan"), float("nan")
+                rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = float("nan"), float("nan")
                 if "rdkit" in runs:
                     print(f"Running rdkit_cluster_only size {size} cutoff {cutoff}")
                     dist_mat_numpy = dist_mat.cpu().numpy()
-                    rdkit_co_time, rdkit_co_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
+                    rdkit_cluster_only_time, rdkit_cluster_only_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
                     if have_real_fps_for_size:
                         print(f"Running rdkit_with_tanimoto size {size} cutoff {cutoff}")
-                        rdkit_wt_time, rdkit_wt_std = bench_rdkit_with_tanimoto(
+                        rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = bench_rdkit_with_tanimoto(
                             rdkit_fps[:size], cutoff, runs=n_runs
                         )
 
@@ -314,24 +310,26 @@ def save_results():
                 if "nvmolkit" in runs:
                     for max_nl in max_nl_sizes:
                         print(f"Running nvmolkit_cluster_only size {size} cutoff {cutoff} max_nl {max_nl}")
-                        nvmol_co_result = time_it(
+                        nvmolkit_cluster_only_result = time_it(
                             lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl),
                             gpu_sync=True,
                             runs=n_runs,
                         )
-                        nvmol_co_time, nvmol_co_std = nvmol_co_result.mean_ms, nvmol_co_result.std_ms
+                        nvmolkit_cluster_only_time = nvmolkit_cluster_only_result.mean_ms
+                        nvmolkit_cluster_only_std = nvmolkit_cluster_only_result.std_ms
 
-                        nvmol_wt_time, nvmol_wt_std = float("nan"), float("nan")
+                        nvmolkit_with_tanimoto_time, nvmolkit_with_tanimoto_std = float("nan"), float("nan")
                         if fps_mat_real is not None:
                             print(
                                 f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}"
                             )
-                            nvmol_wt_result = time_it(
+                            nvmolkit_with_tanimoto_result = time_it(
                                 lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl),
                                 gpu_sync=True,
                                 runs=n_runs,
                             )
-                            nvmol_wt_time, nvmol_wt_std = nvmol_wt_result.mean_ms, nvmol_wt_result.std_ms
+                            nvmolkit_with_tanimoto_time = nvmolkit_with_tanimoto_result.mean_ms
+                            nvmolkit_with_tanimoto_std = nvmolkit_with_tanimoto_result.std_ms
 
                         nvmol_res = butina_nvmol(dist_mat, cutoff, neighborlist_max_size=max_nl).torch()
                         torch.cuda.synchronize()
@@ -346,16 +344,16 @@ def save_results():
                                 "size": size,
                                 "cutoff": cutoff,
                                 "max_neighborlist_size": max_nl,
-                                "rdkit_cluster_only_time_ms": rdkit_co_time,
-                                "rdkit_cluster_only_std_ms": rdkit_co_std,
-                                "rdkit_with_tanimoto_time_ms": rdkit_wt_time,
-                                "rdkit_with_tanimoto_std_ms": rdkit_wt_std,
+                                "rdkit_cluster_only_time_ms": rdkit_cluster_only_time,
+                                "rdkit_cluster_only_std_ms": rdkit_cluster_only_std,
+                                "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time,
+                                "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std,
                                 "rdkit_lowmem_time_ms": rdkit_lm_time,
                                 "rdkit_lowmem_std_ms": rdkit_lm_std,
-                                "nvmolkit_cluster_only_time_ms": nvmol_co_time,
-                                "nvmolkit_cluster_only_std_ms": nvmol_co_std,
-                                "nvmolkit_with_tanimoto_time_ms": nvmol_wt_time,
-                                "nvmolkit_with_tanimoto_std_ms": nvmol_wt_std,
+                                "nvmolkit_cluster_only_time_ms": nvmolkit_cluster_only_time,
+                                "nvmolkit_cluster_only_std_ms": nvmolkit_cluster_only_std,
+                                "nvmolkit_with_tanimoto_time_ms": nvmolkit_with_tanimoto_time,
+                                "nvmolkit_with_tanimoto_std_ms": nvmolkit_with_tanimoto_std,
                                 "fused_butina_time_ms": fused_time,
                                 "fused_butina_std_ms": fused_std,
                             }
@@ -366,10 +364,10 @@ def save_results():
                             "size": size,
                             "cutoff": cutoff,
                             "max_neighborlist_size": float("nan"),
-                            "rdkit_cluster_only_time_ms": rdkit_co_time,
-                            "rdkit_cluster_only_std_ms": rdkit_co_std,
-                            "rdkit_with_tanimoto_time_ms": rdkit_wt_time,
-                            "rdkit_with_tanimoto_std_ms": rdkit_wt_std,
+                            "rdkit_cluster_only_time_ms": rdkit_cluster_only_time,
+                            "rdkit_cluster_only_std_ms": rdkit_cluster_only_std,
+                            "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time,
+                            "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std,
                             "rdkit_lowmem_time_ms": rdkit_lm_time,
                             "rdkit_lowmem_std_ms": rdkit_lm_std,
                             "nvmolkit_cluster_only_time_ms": float("nan"),

From 258f7b08722ee93363ed3b8956d6940a5bc1dacd Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Mon, 1 Jun 2026 08:39:10 -0400
Subject: [PATCH 3/3] butina: collapse nvmolkit_with_tanimoto print onto one
 line

---
 benchmarks/butina_clustering_bench.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py
index 9363efe..dde235b 100644
--- a/benchmarks/butina_clustering_bench.py
+++ b/benchmarks/butina_clustering_bench.py
@@ -320,9 +320,7 @@ def save_results():
 
                         nvmolkit_with_tanimoto_time, nvmolkit_with_tanimoto_std = float("nan"), float("nan")
                         if fps_mat_real is not None:
-                            print(
-                                f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}"
-                            )
+                            print(f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}")
                             nvmolkit_with_tanimoto_result = time_it(
                                 lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl),
                                 gpu_sync=True,