diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py
index f1c5c4e..dde235b 100644
--- a/benchmarks/butina_clustering_bench.py
+++ b/benchmarks/butina_clustering_bench.py
@@ -150,11 +150,14 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
     parser.add_argument("--no-rdkit", action="store_true", help="Disable RDKit benchmarks")
     parser.add_argument("--no-fused", action="store_true", help="Disable fused Butina benchmarks")
     parser.add_argument("--no-nvmolkit", action="store_true", help="Disable nvMolKit Butina benchmarks")
-    parser.add_argument("--no-rdkit-lowmem", action="store_true", help="Disable RDKit low-memory benchmarks")
     parser.add_argument(
-        "--include-tanimoto-matrix",
+        "--rdkit-lowmem",
         action="store_true",
-        help="Include tanimoto matrix calculation in butina timing (for fair comparison with fused_butina)",
+        help=(
+            "Enable the RDKit low-memory backend. Off by default because its "
+            "distance-matrix builder is a pure-Python O(n^2) loop that does "
+            "not finish in reasonable wall time at sizes >= 40k."
+        ),
     )
     parser.add_argument(
         "--config",
@@ -170,7 +173,6 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
     )
     args = parser.parse_args()
 
-    include_tanimoto = args.include_tanimoto_matrix
     n_runs = args.runs
     disabled = set()
     if args.no_rdkit:
@@ -179,7 +181,7 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
         disabled.add("fused")
     if args.no_nvmolkit:
         disabled.add("nvmolkit")
-    if args.no_rdkit_lowmem:
+    if not args.rdkit_lowmem:
         disabled.add("rdkit_lowmem")
 
     if args.config:
@@ -211,35 +213,22 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size):
 
     mols = load_smiles(args.input_smiles_file, max_count=max_size + 100, sanitize=True, seed=args.seed)
 
-    if include_tanimoto and len(mols) < max_size:
-        print(
-            f"Error: --include-tanimoto-matrix requires at least {max_size} molecules, "
-            f"but only {len(mols)} were loaded from input",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-
     fps = get_fingerprints(mols)
 
+    # All three rdkit paths (cluster_only, with_tanimoto, lowmem) need real
+    # RDKit fingerprints, so build them once if any rdkit row is planned.
     max_rdkit_fps_size = max(
-        (e["size"] for e in run_plan if "rdkit_lowmem" in e["run"] or ("rdkit" in e["run"] and include_tanimoto)),
+        (e["size"] for e in run_plan if "rdkit" in e["run"] or "rdkit_lowmem" in e["run"]),
         default=0,
     )
     if max_rdkit_fps_size > 0:
-        if len(mols) < max_rdkit_fps_size:
-            print(
-                f"Error: rdkit benchmarks with fingerprints require at least {max_rdkit_fps_size} "
-                f"molecules, but only {len(mols)} were loaded from input",
-                file=sys.stderr,
-            )
-            sys.exit(1)
         rdkit_fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=1024)
         rdkit_fps = [rdkit_fpgen.GetFingerprint(mol) for mol in mols]
     else:
         rdkit_fps = None
 
     output_path = args.output
-    cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.1]
+    cutoffs = [args.cutoff] if args.cutoff is not None else [1e-10, 0.1, 0.2, 0.35, 1.0]
     default_nl_sizes = [8, 16, 32, 64, 128]
     results = []
 
@@ -254,34 +243,57 @@ def save_results():
             runs = entry["run"]
             max_nl_sizes = entry.get("neighborlist_sizes", default_nl_sizes)
 
-            need_dist = "nvmolkit" in runs or ("rdkit" in runs and not include_tanimoto)
-            if include_tanimoto:
-                fps_mat = fps[:size].contiguous()
-                if need_dist:
-                    dist_mat = 1.0 - crossTanimotoSimilarity(fps_mat).torch()
+            # with_tanimoto and lowmem need real fingerprints for every mol up to
+            # `size`; when the input is smaller, those rows are skipped here.
+            have_real_fps_for_size = rdkit_fps is not None and len(rdkit_fps) >= size
+
+            need_real_fps_mat = "fused" in runs or "nvmolkit" in runs
+            if need_real_fps_mat and len(mols) >= size:
+                fps_mat_real = fps[:size].contiguous()
+            else:
+                fps_mat_real = None
+            if need_real_fps_mat and fps_mat_real is None:
+                fps_mat_synth = resize_and_fill_fingerprints(fps, size)
             else:
-                fps_mat = resize_and_fill_fingerprints(fps, size)
-                if need_dist:
-                    real_size = min(size, len(mols))
+                fps_mat_synth = None
+            fps_mat = fps_mat_real if fps_mat_real is not None else fps_mat_synth
+
+            need_dist = "nvmolkit" in runs or "rdkit" in runs
+            if need_dist:
+                real_size = min(size, len(mols))
+                if "nvmolkit" in runs or "fused" in runs:
                     base_dists = 1.0 - crossTanimotoSimilarity(fps[:real_size]).torch()
+                else:
+                    rdkit_dist = np.empty((real_size, real_size), dtype=np.float64)
+                    for i in range(real_size):
+                        rdkit_dist[i] = BulkTanimotoSimilarity(rdkit_fps[i], rdkit_fps[:real_size])
+                    np.subtract(1.0, rdkit_dist, out=rdkit_dist)
+                    base_dists = torch.from_numpy(rdkit_dist)
+                if real_size >= size:
+                    dist_mat = base_dists.contiguous()
+                else:
                     dist_mat = resize_and_fill(base_dists, size)
                     del base_dists
 
             for cutoff in cutoffs:
                 # Don't run large sizes for edge cases.
-                if cutoff in (1e-10, 1.1) and size > 20000:
+                if cutoff in (1e-10, 1.0) and size > 20000:
                     continue
 
-                rdkit_time, rdk_std = float("nan"), float("nan")
+                rdkit_cluster_only_time, rdkit_cluster_only_std = float("nan"), float("nan")
+                rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = float("nan"), float("nan")
                 if "rdkit" in runs:
-                    if include_tanimoto:
-                        rdkit_time, rdk_std = bench_rdkit_with_tanimoto(rdkit_fps[:size], cutoff, runs=n_runs)
-                    else:
-                        dist_mat_numpy = dist_mat.cpu().numpy()
-                        rdkit_time, rdk_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
+                    print(f"Running rdkit_cluster_only size {size} cutoff {cutoff}")
+                    dist_mat_numpy = dist_mat.cpu().numpy()
+                    rdkit_cluster_only_time, rdkit_cluster_only_std = bench_rdkit(dist_mat_numpy, cutoff, runs=n_runs)
+                    if have_real_fps_for_size:
+                        print(f"Running rdkit_with_tanimoto size {size} cutoff {cutoff}")
+                        rdkit_with_tanimoto_time, rdkit_with_tanimoto_std = bench_rdkit_with_tanimoto(
+                            rdkit_fps[:size], cutoff, runs=n_runs
+                        )
 
                 rdkit_lm_time, rdkit_lm_std = float("nan"), float("nan")
-                if "rdkit_lowmem" in runs:
+                if "rdkit_lowmem" in runs and have_real_fps_for_size:
                     print(f"Running rdkit_lowmem size {size} cutoff {cutoff}")
                     rdkit_lm_time, rdkit_lm_std = bench_rdkit_lowmem(rdkit_fps[:size], cutoff, runs=n_runs)
 
@@ -297,20 +309,25 @@ def save_results():
 
                 if "nvmolkit" in runs:
                     for max_nl in max_nl_sizes:
-                        print(f"Running nvmolkit size {size} cutoff {cutoff} max_nl {max_nl}")
-                        if include_tanimoto:
-                            nvmol_result = time_it(
-                                lambda: bench_nvmol_with_tanimoto(fps_mat, cutoff, max_nl),
-                                gpu_sync=True,
-                                runs=n_runs,
-                            )
-                        else:
-                            nvmol_result = time_it(
-                                lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl),
+                        print(f"Running nvmolkit_cluster_only size {size} cutoff {cutoff} max_nl {max_nl}")
+                        nvmolkit_cluster_only_result = time_it(
+                            lambda: bench_nvmol_inner(dist_mat, cutoff, max_nl),
+                            gpu_sync=True,
+                            runs=n_runs,
+                        )
+                        nvmolkit_cluster_only_time = nvmolkit_cluster_only_result.mean_ms
+                        nvmolkit_cluster_only_std = nvmolkit_cluster_only_result.std_ms
+
+                        nvmolkit_with_tanimoto_time, nvmolkit_with_tanimoto_std = float("nan"), float("nan")
+                        if fps_mat_real is not None:
+                            print(f"Running nvmolkit_with_tanimoto size {size} cutoff {cutoff} max_nl {max_nl}")
+                            nvmolkit_with_tanimoto_result = time_it(
+                                lambda: bench_nvmol_with_tanimoto(fps_mat_real, cutoff, max_nl),
                                 gpu_sync=True,
                                 runs=n_runs,
                             )
-                        nvmol_time, nvmol_std = nvmol_result.mean_ms, nvmol_result.std_ms
+                            nvmolkit_with_tanimoto_time = nvmolkit_with_tanimoto_result.mean_ms
+                            nvmolkit_with_tanimoto_std = nvmolkit_with_tanimoto_result.std_ms
 
                         nvmol_res = butina_nvmol(dist_mat, cutoff, neighborlist_max_size=max_nl).torch()
                         torch.cuda.synchronize()
@@ -325,12 +342,16 @@ def save_results():
                                 "size": size,
                                 "cutoff": cutoff,
                                 "max_neighborlist_size": max_nl,
-                                "rdkit_time_ms": rdkit_time,
-                                "rdkit_std_ms": rdk_std,
+                                "rdkit_cluster_only_time_ms": rdkit_cluster_only_time,
+                                "rdkit_cluster_only_std_ms": rdkit_cluster_only_std,
+                                "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time,
+                                "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std,
                                 "rdkit_lowmem_time_ms": rdkit_lm_time,
                                 "rdkit_lowmem_std_ms": rdkit_lm_std,
-                                "nvmol_time_ms": nvmol_time,
-                                "nvmol_std_ms": nvmol_std,
+                                "nvmolkit_cluster_only_time_ms": nvmolkit_cluster_only_time,
+                                "nvmolkit_cluster_only_std_ms": nvmolkit_cluster_only_std,
+                                "nvmolkit_with_tanimoto_time_ms": nvmolkit_with_tanimoto_time,
+                                "nvmolkit_with_tanimoto_std_ms": nvmolkit_with_tanimoto_std,
                                 "fused_butina_time_ms": fused_time,
                                 "fused_butina_std_ms": fused_std,
                             }
@@ -341,12 +362,16 @@ def save_results():
                             "size": size,
                             "cutoff": cutoff,
                             "max_neighborlist_size": float("nan"),
-                            "rdkit_time_ms": rdkit_time,
-                            "rdkit_std_ms": rdk_std,
+                            "rdkit_cluster_only_time_ms": rdkit_cluster_only_time,
+                            "rdkit_cluster_only_std_ms": rdkit_cluster_only_std,
+                            "rdkit_with_tanimoto_time_ms": rdkit_with_tanimoto_time,
+                            "rdkit_with_tanimoto_std_ms": rdkit_with_tanimoto_std,
                             "rdkit_lowmem_time_ms": rdkit_lm_time,
                             "rdkit_lowmem_std_ms": rdkit_lm_std,
-                            "nvmol_time_ms": float("nan"),
-                            "nvmol_std_ms": float("nan"),
+                            "nvmolkit_cluster_only_time_ms": float("nan"),
+                            "nvmolkit_cluster_only_std_ms": float("nan"),
+                            "nvmolkit_with_tanimoto_time_ms": float("nan"),
+                            "nvmolkit_with_tanimoto_std_ms": float("nan"),
                             "fused_butina_time_ms": fused_time,
                             "fused_butina_std_ms": fused_std,
                         }