From 5ed578c0abf34b5ae42e9b4473ad84a018fe242d Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Tue, 26 May 2026 13:58:14 -0400
Subject: [PATCH 1/5] Autotune: physical-core CPU count + categorical batch
 sizes, cap pools at 8

- cpu_count() now parses /proc/cpuinfo for unique (physical_id, core_id)
  pairs so SMT siblings don't double-count, falling back to os.cpu_count().
- Default FF / embed / substruct search spaces use a categorical list of
  multiples of 64 for batchSize (kernels are tile-tuned for those sizes).
- batchesPerGpu / workerThreads are now capped at min(8, cpus // num_gpus);
  8 is the empirical point of diminishing returns and the floor prevents
  CPU oversubscription across GPUs.
---
 nvmolkit/autotune/_ff_common.py           | 61 ++++++++++++++++++++---
 nvmolkit/autotune/tune_embed_molecules.py | 13 +++--
 nvmolkit/autotune/tune_substructure.py    | 12 +++--
 nvmolkit/tests/test_autotune.py           | 51 ++++++++++++++++---
 4 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/nvmolkit/autotune/_ff_common.py b/nvmolkit/autotune/_ff_common.py
index 0f174cee..6ebf116c 100644
--- a/nvmolkit/autotune/_ff_common.py
+++ b/nvmolkit/autotune/_ff_common.py
@@ -49,17 +49,59 @@ def coerce_gpu_ids(gpuIds: Optional[Iterable[int]]) -> list[int]:
 
 
 def cpu_count() -> int:
-    """Return the usable CPU core count, with a floor of 1."""
+    """Return the usable physical CPU core count, with a floor of 1.
+
+    Parses ``/proc/cpuinfo`` for unique ``(physical id, core id)`` pairs so
+    SMT siblings don't double-count. Falls back to ``os.cpu_count()`` (logical
+    cores) when ``/proc/cpuinfo`` is unavailable or unparseable; that fallback
+    overcounts on SMT hardware but never undercounts, and is only hit on
+    non-Linux systems we don't currently target.
+    """
+    physical = _physical_cpu_count_from_proc()
+    if physical is not None:
+        return max(1, physical)
     return max(1, os.cpu_count() or 1)
 
 
+def _physical_cpu_count_from_proc() -> Optional[int]:
+    """Return the number of distinct physical cores from ``/proc/cpuinfo``."""
+    try:
+        with open("/proc/cpuinfo", "r") as cpuinfo:
+            text = cpuinfo.read()
+    except OSError:
+        return None
+
+    pairs: set[tuple[str, str]] = set()
+    physical_id: Optional[str] = None
+    core_id: Optional[str] = None
+    for line in text.splitlines():
+        if not line.strip():
+            if physical_id is not None and core_id is not None:
+                pairs.add((physical_id, core_id))
+            physical_id = None
+            core_id = None
+            continue
+        if ":" not in line:
+            continue
+        key, value = line.split(":", 1)
+        key = key.strip()
+        value = value.strip()
+        if key == "physical id":
+            physical_id = value
+        elif key == "core id":
+            core_id = value
+    if physical_id is not None and core_id is not None:
+        pairs.add((physical_id, core_id))
+    return len(pairs) if pairs else None
+
+
 def resolve_cpu_budget(cpu_budget: Optional[int]) -> int:
     """Return the effective CPU budget for autotune search ranges.
 
     ``cpu_budget`` lets callers cap total CPU usage explicitly (useful for
     cross-machine normalization where the goal is to isolate GPU performance
-    from CPU-count differences). When ``None``, falls back to
-    :func:`cpu_count`. Values less than 1 are rejected.
+    from CPU-count differences). When ``None``, falls back to physical core
+    count via :func:`cpu_count`. Values less than 1 are rejected.
     """
     if cpu_budget is None:
         return cpu_count()
@@ -89,12 +131,15 @@ def resolve_num_gpus(fixed_gpu_ids: list[int]) -> int:
 def default_ff_search_space(num_gpus: int, cpus: int) -> dict:
     """Return the default FF search space scaled to the active hardware.
 
-    ``batchesPerGpu`` is per-GPU; its upper bound is capped at
-    ``cpus // num_gpus`` so the total worker thread count across all GPUs
-    (``num_gpus * batchesPerGpu``) does not exceed ``cpus``.
+    ``batchSize`` is restricted to multiples of 64 (categorical) since the
+    underlying kernels are tile-tuned for those sizes. ``batchesPerGpu`` is
+    per-GPU and capped at ``min(8, cpus // num_gpus)``: 8 is the empirical
+    point of diminishing returns for the batched-FF dispatch, and the
+    physical-core floor prevents oversubscribing CPU coordinator threads
+    across all GPUs.
     """
-    per_gpu_max = max(1, cpus // max(1, num_gpus))
+    per_gpu_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": (32, 1500, "log"),
+        "batchSize": [512, 64, 128, 192, 256, 320, 384, 448, 768, 1024],
         "batchesPerGpu": (1, per_gpu_max),
     }
diff --git a/nvmolkit/autotune/tune_embed_molecules.py b/nvmolkit/autotune/tune_embed_molecules.py
index 5e248a86..1149e731 100644
--- a/nvmolkit/autotune/tune_embed_molecules.py
+++ b/nvmolkit/autotune/tune_embed_molecules.py
@@ -45,13 +45,16 @@ def _default_embed_search_space(num_gpus: int, cpus: int) -> dict:
     EmbedMolecules runs preprocessing and GPU-dispatch threads sequentially,
     so each pool is capped independently:
 
-    * ``batchesPerGpu`` (per-GPU GPU-runner threads): max = ``cpus //
-      num_gpus``.
-    * ``preprocessingThreads`` (total CPU pool): max = ``cpus``.
+    * ``batchSize`` (categorical, multiples of 64): the embed kernels are
+      tile-tuned for these sizes.
+    * ``batchesPerGpu`` (per-GPU GPU-runner threads): max =
+      ``min(8, cpus // num_gpus)``. 8 is the empirical point of diminishing
+      returns; the physical-core floor prevents oversubscribing across GPUs.
+    * ``preprocessingThreads`` (total CPU pool): max = ``cpus`` (physical).
     """
-    per_gpu_max = max(1, cpus // max(1, num_gpus))
+    per_gpu_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": (64, 2000, "log"),
+        "batchSize": [512, 64, 128, 192, 256, 320, 384, 448, 768, 1024],
         "batchesPerGpu": (1, per_gpu_max),
         "preprocessingThreads": (1, cpus),
     }
diff --git a/nvmolkit/autotune/tune_substructure.py b/nvmolkit/autotune/tune_substructure.py
index f3162f4c..a08644cb 100644
--- a/nvmolkit/autotune/tune_substructure.py
+++ b/nvmolkit/autotune/tune_substructure.py
@@ -64,13 +64,17 @@ def _default_substruct_search_space(num_gpus: int, cpus: int) -> dict:
     at trial sampling time by clamping ``preprocessingThreads`` to whatever
     cores are left after ``workerThreads`` is chosen.
 
-    * ``workerThreads`` (per-GPU): max = ``cpus // num_gpus``.
-    * ``preprocessingThreads`` (total CPU pool): max = ``cpus`` (the
+    * ``batchSize`` (categorical, multiples of 64): the substructure kernels
+      are tile-tuned for these sizes.
+    * ``workerThreads`` (per-GPU): max = ``min(8, cpus // num_gpus)``.
+      8 is the empirical point of diminishing returns; the physical-core
+      floor prevents oversubscribing across GPUs.
+    * ``preprocessingThreads`` (total CPU pool): max = ``cpus`` (physical;
       effective per-trial cap is reduced once ``workerThreads`` is sampled).
     """
-    per_gpu_worker_max = max(1, cpus // max(1, num_gpus))
+    per_gpu_worker_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": (128, 8192, "log"),
+        "batchSize": [1024, 128, 256, 512, 768],
         "workerThreads": (1, per_gpu_worker_max),
         "preprocessingThreads": (1, cpus),
     }
diff --git a/nvmolkit/tests/test_autotune.py b/nvmolkit/tests/test_autotune.py
index 19cabf2c..6be9fcd4 100644
--- a/nvmolkit/tests/test_autotune.py
+++ b/nvmolkit/tests/test_autotune.py
@@ -256,23 +256,30 @@ def test_shrink_halves_within_floor():
 
 
 def test_default_ff_search_space_caps_batches_per_gpu_by_cpu_count():
-    """``batchesPerGpu`` upper bound divides the CPU budget by the GPU count."""
+    """``batchesPerGpu`` upper bound is ``min(8, cpus // num_gpus)``."""
     space_1gpu = _ff_common.default_ff_search_space(num_gpus=1, cpus=32)
     space_4gpu = _ff_common.default_ff_search_space(num_gpus=4, cpus=32)
     space_64gpu = _ff_common.default_ff_search_space(num_gpus=64, cpus=32)
 
-    assert space_1gpu["batchesPerGpu"] == (1, 32)
+    assert space_1gpu["batchesPerGpu"] == (1, 8)
     assert space_4gpu["batchesPerGpu"] == (1, 8)
     assert space_64gpu["batchesPerGpu"] == (1, 1)
 
 
+def test_default_ff_search_space_batch_size_is_categorical_multiples_of_64():
+    """``batchSize`` is a list of multiples of 64 (categorical)."""
+    space = _ff_common.default_ff_search_space(num_gpus=1, cpus=32)
+    assert isinstance(space["batchSize"], list)
+    assert all(isinstance(s, int) and s % 64 == 0 for s in space["batchSize"])
+
+
 def test_default_substruct_search_space_caps_per_pool():
-    """Substruct: workerThreads capped per-GPU, preprocessingThreads capped at cpus."""
+    """Substruct: workerThreads capped at min(8, cpus/num_gpus), prep at cpus."""
     space_1gpu = _default_substruct_search_space(num_gpus=1, cpus=16)
     space_4gpu = _default_substruct_search_space(num_gpus=4, cpus=16)
     space_64gpu = _default_substruct_search_space(num_gpus=64, cpus=16)
 
-    assert space_1gpu["workerThreads"] == (1, 16)
+    assert space_1gpu["workerThreads"] == (1, 8)
     assert space_4gpu["workerThreads"] == (1, 4)
     assert space_64gpu["workerThreads"] == (1, 1)
 
@@ -280,7 +287,8 @@ def test_default_substruct_search_space_caps_per_pool():
     assert space_4gpu["preprocessingThreads"] == (1, 16)
     assert space_64gpu["preprocessingThreads"] == (1, 16)
 
-    assert space_1gpu["batchSize"] == (128, 8192, "log")
+    assert isinstance(space_1gpu["batchSize"], list)
+    assert all(isinstance(s, int) and s % 64 == 0 for s in space_1gpu["batchSize"])
 
 
 class _RecordingTrial:
@@ -331,10 +339,18 @@ def test_suggest_preprocessing_threads_propagates_log_flag():
 
 
 def test_default_embed_search_space_caps_per_pool():
-    """Embed: per-GPU pool capped at cpus/numGpus, total pool capped at cpus."""
+    """Embed: per-GPU pool capped at min(8, cpus/numGpus), total prep pool at cpus."""
     space = _default_embed_search_space(num_gpus=4, cpus=16)
     assert space["batchesPerGpu"] == (1, 4)
     assert space["preprocessingThreads"] == (1, 16)
+    assert isinstance(space["batchSize"], list)
+    assert all(isinstance(s, int) and s % 64 == 0 for s in space["batchSize"])
+
+
+def test_default_embed_search_space_caps_batches_per_gpu_at_eight():
+    """When cpus/num_gpus exceeds 8, ``batchesPerGpu`` is still capped at 8."""
+    space = _default_embed_search_space(num_gpus=4, cpus=128)
+    assert space["batchesPerGpu"] == (1, 8)
 
 
 def test_resolve_num_gpus_prefers_explicit_list():
@@ -349,6 +365,29 @@ def test_resolve_cpu_budget_falls_back_to_cpu_count(monkeypatch):
     assert _ff_common.resolve_cpu_budget(None) == 24
 
 
+def test_physical_cpu_count_from_proc_dedupes_smt_siblings(tmp_path, monkeypatch):
+    """``_physical_cpu_count_from_proc`` collapses SMT siblings by ``(physical id, core id)``."""
+    fake_cpuinfo = (
+        "processor\t: 0\nphysical id\t: 0\ncore id\t: 0\n\n"
+        "processor\t: 1\nphysical id\t: 0\ncore id\t: 0\n\n"
+        "processor\t: 2\nphysical id\t: 0\ncore id\t: 1\n\n"
+        "processor\t: 3\nphysical id\t: 0\ncore id\t: 1\n\n"
+        "processor\t: 4\nphysical id\t: 1\ncore id\t: 0\n\n"
+        "processor\t: 5\nphysical id\t: 1\ncore id\t: 0\n\n"
+    )
+    fake_path = tmp_path / "cpuinfo"
+    fake_path.write_text(fake_cpuinfo)
+    real_open = open
+
+    def fake_open(path, *args, **kwargs):
+        if path == "/proc/cpuinfo":
+            return real_open(fake_path, *args, **kwargs)
+        return real_open(path, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.open", fake_open)
+    assert _ff_common._physical_cpu_count_from_proc() == 3
+
+
 def test_resolve_cpu_budget_uses_explicit_value():
     """An explicit ``cpu_budget`` overrides whatever the OS reports."""
     assert _ff_common.resolve_cpu_budget(14) == 14

From b9ba2f51c8b3e440c495822655953f6be5ba20bb Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Tue, 26 May 2026 14:38:44 -0400
Subject: [PATCH 2/5] Simplify CPU path

---
 nvmolkit/autotune/_ff_common.py | 37 +++++++--------------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/nvmolkit/autotune/_ff_common.py b/nvmolkit/autotune/_ff_common.py
index 6ebf116c..f11110ca 100644
--- a/nvmolkit/autotune/_ff_common.py
+++ b/nvmolkit/autotune/_ff_common.py
@@ -24,6 +24,7 @@
 from __future__ import annotations
 
 import os
+import re
 from typing import Iterable, Optional
 
 from rdkit.Chem import Mol
@@ -51,11 +52,7 @@ def coerce_gpu_ids(gpuIds: Optional[Iterable[int]]) -> list[int]:
 def cpu_count() -> int:
     """Return the usable physical CPU core count, with a floor of 1.
 
-    Parses ``/proc/cpuinfo`` for unique ``(physical id, core id)`` pairs so
-    SMT siblings don't double-count. Falls back to ``os.cpu_count()`` (logical
-    cores) when ``/proc/cpuinfo`` is unavailable or unparseable; that fallback
-    overcounts on SMT hardware but never undercounts, and is only hit on
-    non-Linux systems we don't currently target.
+    Falls back to ``os.cpu_count()`` (logical) if ``/proc/cpuinfo`` is missing.
     """
     physical = _physical_cpu_count_from_proc()
     if physical is not None:
@@ -66,33 +63,15 @@ def cpu_count() -> int:
 def _physical_cpu_count_from_proc() -> Optional[int]:
     """Return the number of distinct physical cores from ``/proc/cpuinfo``."""
     try:
-        with open("/proc/cpuinfo", "r") as cpuinfo:
+        with open("/proc/cpuinfo") as cpuinfo:
             text = cpuinfo.read()
     except OSError:
         return None
-
-    pairs: set[tuple[str, str]] = set()
-    physical_id: Optional[str] = None
-    core_id: Optional[str] = None
-    for line in text.splitlines():
-        if not line.strip():
-            if physical_id is not None and core_id is not None:
-                pairs.add((physical_id, core_id))
-            physical_id = None
-            core_id = None
-            continue
-        if ":" not in line:
-            continue
-        key, value = line.split(":", 1)
-        key = key.strip()
-        value = value.strip()
-        if key == "physical id":
-            physical_id = value
-        elif key == "core id":
-            core_id = value
-    if physical_id is not None and core_id is not None:
-        pairs.add((physical_id, core_id))
-    return len(pairs) if pairs else None
+    physical_ids = re.findall(r"^physical id\s*:\s*(\S+)", text, re.MULTILINE)
+    core_ids = re.findall(r"^core id\s*:\s*(\S+)", text, re.MULTILINE)
+    if not physical_ids or len(physical_ids) != len(core_ids):
+        return None
+    return len(set(zip(physical_ids, core_ids)))
 
 
 def resolve_cpu_budget(cpu_budget: Optional[int]) -> int:

From 90383ed72741288cd2bac047e9c130f714e64e18 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Tue, 26 May 2026 15:17:59 -0400
Subject: [PATCH 3/5] Further fixes for stepping

---
 nvmolkit/autotune/_core.py                | 25 +++++++++++++++--
 nvmolkit/autotune/_ff_common.py           | 15 +++++-----
 nvmolkit/autotune/tune_embed_molecules.py |  6 ++--
 nvmolkit/autotune/tune_substructure.py    |  6 ++--
 nvmolkit/tests/test_autotune.py           | 34 +++++++++++++----------
 5 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/nvmolkit/autotune/_core.py b/nvmolkit/autotune/_core.py
index b990e93c..41eb45a8 100644
--- a/nvmolkit/autotune/_core.py
+++ b/nvmolkit/autotune/_core.py
@@ -287,6 +287,9 @@ def suggest_from_space(trial, name: str, spec: Any) -> Any:
 
     - ``(low, high)`` tuple — uniform integer range.
     - ``(low, high, "log")`` tuple — log-uniform integer range.
+    - ``(low, high, step)`` tuple — uniform integer range restricted to
+      multiples of ``step`` (preserves ordering for TPE, unlike a categorical
+      list of the same values).
     - ``list`` of choices — categorical search.
 
     A literal scalar is returned unchanged (acts as a fixed value rather than
@@ -299,6 +302,9 @@ def suggest_from_space(trial, name: str, spec: Any) -> Any:
     if isinstance(spec, tuple) and len(spec) == 3 and spec[2] == "log":
         low, high, _ = spec
         return trial.suggest_int(name, int(low), int(high), log=True)
+    if isinstance(spec, tuple) and len(spec) == 3 and all(isinstance(v, int) for v in spec):
+        low, high, step = spec
+        return trial.suggest_int(name, int(low), int(high), step=int(step))
     if isinstance(spec, list) and len(spec) == 2 and all(isinstance(v, int) for v in spec):
         raise TypeError(
             f"Search-space override for {name!r} is a 2-element int list {spec!r}; "
@@ -310,6 +316,12 @@ def suggest_from_space(trial, name: str, spec: Any) -> Any:
             f"Search-space override for {name!r} is a 3-element list {spec!r} ending in 'log'; "
             "use a tuple (low, high, 'log') for a log-uniform integer range."
         )
+    if isinstance(spec, list) and len(spec) == 3 and all(isinstance(v, int) for v in spec):
+        raise TypeError(
+            f"Search-space override for {name!r} is a 3-element int list {spec!r}; "
+            "use a tuple (low, high, step) for a stepped integer range, or wrap in "
+            "a list of more than three values to request a categorical search."
+        )
     if isinstance(spec, (list, tuple)):
         choices = list(spec)
         return trial.suggest_categorical(name, choices)
@@ -322,9 +334,11 @@ def collect_int_from_space(spec: Any) -> int:
     For a uniform integer range ``(low, high)`` the arithmetic midpoint is
     returned. For a log-uniform range ``(low, high, "log")`` the geometric
     midpoint is returned (rounded to the nearest int, clamped to ``[low,
-    high]``). For a categorical list the first listed choice is returned, on
-    the convention that callers list their preferred default first. A bare
-    scalar is returned unchanged.
+    high]``). For a stepped range ``(low, high, step)`` the arithmetic
+    midpoint snapped to the nearest multiple of ``step`` from ``low`` is
+    returned, clamped to ``[low, high]``. For a categorical list the first
+    listed choice is returned, on the convention that callers list their
+    preferred default first. A bare scalar is returned unchanged.
     """
     if isinstance(spec, tuple) and len(spec) == 3 and spec[2] == "log":
         low, high, _ = spec
@@ -334,6 +348,11 @@ def collect_int_from_space(spec: Any) -> int:
             raise ValueError(f"Log-uniform range {spec!r} requires strictly positive bounds.")
         midpoint = int(round(math.sqrt(low_int * high_int)))
         return max(low_int, min(high_int, midpoint))
+    if isinstance(spec, tuple) and len(spec) == 3 and all(isinstance(v, int) for v in spec):
+        low, high, step = (int(v) for v in spec)
+        midpoint = (low + high) // 2
+        snapped = low + round((midpoint - low) / step) * step
+        return max(low, min(high, snapped))
     if isinstance(spec, tuple) and len(spec) == 2:
         low, high = spec
         return (int(low) + int(high)) // 2
diff --git a/nvmolkit/autotune/_ff_common.py b/nvmolkit/autotune/_ff_common.py
index f11110ca..c37845fb 100644
--- a/nvmolkit/autotune/_ff_common.py
+++ b/nvmolkit/autotune/_ff_common.py
@@ -110,15 +110,16 @@ def resolve_num_gpus(fixed_gpu_ids: list[int]) -> int:
 def default_ff_search_space(num_gpus: int, cpus: int) -> dict:
     """Return the default FF search space scaled to the active hardware.
 
-    ``batchSize`` is restricted to multiples of 64 (categorical) since the
-    underlying kernels are tile-tuned for those sizes. ``batchesPerGpu`` is
-    per-GPU and capped at ``min(8, cpus // num_gpus)``: 8 is the empirical
-    point of diminishing returns for the batched-FF dispatch, and the
-    physical-core floor prevents oversubscribing CPU coordinator threads
-    across all GPUs.
+    ``batchSize`` is a stepped integer range in multiples of 64 since the
+    underlying kernels are tile-tuned for those sizes; the stepped form
+    preserves numeric ordering for TPE (unlike a categorical list).
+    ``batchesPerGpu`` is per-GPU and capped at ``min(8, cpus // num_gpus)``:
+    8 is the empirical point of diminishing returns for the batched-FF
+    dispatch, and the physical-core floor prevents oversubscribing CPU
+    coordinator threads across all GPUs.
     """
     per_gpu_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": [512, 64, 128, 192, 256, 320, 384, 448, 768, 1024],
+        "batchSize": (64, 1024, 64),
         "batchesPerGpu": (1, per_gpu_max),
     }
diff --git a/nvmolkit/autotune/tune_embed_molecules.py b/nvmolkit/autotune/tune_embed_molecules.py
index 1149e731..437f8649 100644
--- a/nvmolkit/autotune/tune_embed_molecules.py
+++ b/nvmolkit/autotune/tune_embed_molecules.py
@@ -45,8 +45,8 @@ def _default_embed_search_space(num_gpus: int, cpus: int) -> dict:
     EmbedMolecules runs preprocessing and GPU-dispatch threads sequentially,
     so each pool is capped independently:
 
-    * ``batchSize`` (categorical, multiples of 64): the embed kernels are
-      tile-tuned for these sizes.
+    * ``batchSize``: stepped int range in multiples of 64 (kernels are
+      tile-tuned for these sizes); stepping preserves numeric ordering for TPE.
     * ``batchesPerGpu`` (per-GPU GPU-runner threads): max =
       ``min(8, cpus // num_gpus)``. 8 is the empirical point of diminishing
       returns; the physical-core floor prevents oversubscribing across GPUs.
@@ -54,7 +54,7 @@ def _default_embed_search_space(num_gpus: int, cpus: int) -> dict:
     """
     per_gpu_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": [512, 64, 128, 192, 256, 320, 384, 448, 768, 1024],
+        "batchSize": (64, 1024, 64),
         "batchesPerGpu": (1, per_gpu_max),
         "preprocessingThreads": (1, cpus),
     }
diff --git a/nvmolkit/autotune/tune_substructure.py b/nvmolkit/autotune/tune_substructure.py
index a08644cb..49418688 100644
--- a/nvmolkit/autotune/tune_substructure.py
+++ b/nvmolkit/autotune/tune_substructure.py
@@ -64,8 +64,8 @@ def _default_substruct_search_space(num_gpus: int, cpus: int) -> dict:
     at trial sampling time by clamping ``preprocessingThreads`` to whatever
     cores are left after ``workerThreads`` is chosen.
 
-    * ``batchSize`` (categorical, multiples of 64): the substructure kernels
-      are tile-tuned for these sizes.
+    * ``batchSize``: stepped int range in multiples of 128 (kernels are
+      tile-tuned for these sizes); stepping preserves numeric ordering for TPE.
     * ``workerThreads`` (per-GPU): max = ``min(8, cpus // num_gpus)``.
       8 is the empirical point of diminishing returns; the physical-core
       floor prevents oversubscribing across GPUs.
@@ -74,7 +74,7 @@ def _default_substruct_search_space(num_gpus: int, cpus: int) -> dict:
     """
     per_gpu_worker_max = max(1, min(8, cpus // max(1, num_gpus)))
     return {
-        "batchSize": [1024, 128, 256, 512, 768],
+        "batchSize": (128, 1024, 128),
         "workerThreads": (1, per_gpu_worker_max),
         "preprocessingThreads": (1, cpus),
     }
diff --git a/nvmolkit/tests/test_autotune.py b/nvmolkit/tests/test_autotune.py
index 6be9fcd4..28f322c8 100644
--- a/nvmolkit/tests/test_autotune.py
+++ b/nvmolkit/tests/test_autotune.py
@@ -266,11 +266,13 @@ def test_default_ff_search_space_caps_batches_per_gpu_by_cpu_count():
     assert space_64gpu["batchesPerGpu"] == (1, 1)
 
 
-def test_default_ff_search_space_batch_size_is_categorical_multiples_of_64():
-    """``batchSize`` is a list of multiples of 64 (categorical)."""
+def test_default_ff_search_space_batch_size_is_stepped_multiples_of_64():
+    """``batchSize`` is a stepped int range in multiples of 64."""
     space = _ff_common.default_ff_search_space(num_gpus=1, cpus=32)
-    assert isinstance(space["batchSize"], list)
-    assert all(isinstance(s, int) and s % 64 == 0 for s in space["batchSize"])
+    low, high, step = space["batchSize"]
+    assert step == 64
+    assert low % 64 == 0 and high % 64 == 0
+    assert low <= high
 
 
 def test_default_substruct_search_space_caps_per_pool():
@@ -287,8 +289,10 @@ def test_default_substruct_search_space_caps_per_pool():
     assert space_4gpu["preprocessingThreads"] == (1, 16)
     assert space_64gpu["preprocessingThreads"] == (1, 16)
 
-    assert isinstance(space_1gpu["batchSize"], list)
-    assert all(isinstance(s, int) and s % 64 == 0 for s in space_1gpu["batchSize"])
+    low, high, step = space_1gpu["batchSize"]
+    assert step % 64 == 0
+    assert low % step == 0 and high % step == 0
+    assert low <= high
 
 
 class _RecordingTrial:
@@ -298,8 +302,8 @@ def __init__(self, picker=lambda low, high: low):
         self.calls: list[dict] = []
         self.picker = picker
 
-    def suggest_int(self, name: str, low: int, high: int, log: bool = False) -> int:
-        self.calls.append({"name": name, "low": low, "high": high, "log": log})
+    def suggest_int(self, name: str, low: int, high: int, log: bool = False, step: int = 1) -> int:
+        self.calls.append({"name": name, "low": low, "high": high, "log": log, "step": step})
         return int(self.picker(low, high))
 
 
@@ -308,7 +312,7 @@ def test_suggest_preprocessing_threads_clamps_to_remaining_cpu_budget():
     trial = _RecordingTrial()
     spec = (1, 32)
     value = _suggest_preprocessing_threads(trial, spec, worker_threads=6, num_gpus=2, cpus=16)
-    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 4, "log": False}]
+    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 4, "log": False, "step": 1}]
     assert value == 1
 
 
@@ -317,7 +321,7 @@ def test_suggest_preprocessing_threads_respects_low_floor_when_budget_exhausted(
     trial = _RecordingTrial(picker=lambda low, high: high)
     spec = (4, 32)
     value = _suggest_preprocessing_threads(trial, spec, worker_threads=8, num_gpus=2, cpus=16)
-    assert trial.calls == [{"name": "preprocessingThreads", "low": 4, "high": 4, "log": False}]
+    assert trial.calls == [{"name": "preprocessingThreads", "low": 4, "high": 4, "log": False, "step": 1}]
     assert value == 4
 
 
@@ -326,7 +330,7 @@ def test_suggest_preprocessing_threads_does_not_clamp_when_budget_available():
     trial = _RecordingTrial(picker=lambda low, high: high)
     spec = (1, 8)
     value = _suggest_preprocessing_threads(trial, spec, worker_threads=2, num_gpus=2, cpus=32)
-    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 8, "log": False}]
+    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 8, "log": False, "step": 1}]
     assert value == 8
 
 
@@ -335,7 +339,7 @@ def test_suggest_preprocessing_threads_propagates_log_flag():
     trial = _RecordingTrial()
     spec = (1, 32, "log")
     _suggest_preprocessing_threads(trial, spec, worker_threads=2, num_gpus=1, cpus=8)
-    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 6, "log": True}]
+    assert trial.calls == [{"name": "preprocessingThreads", "low": 1, "high": 6, "log": True, "step": 1}]
 
 
 def test_default_embed_search_space_caps_per_pool():
@@ -343,8 +347,10 @@ def test_default_embed_search_space_caps_per_pool():
     space = _default_embed_search_space(num_gpus=4, cpus=16)
     assert space["batchesPerGpu"] == (1, 4)
     assert space["preprocessingThreads"] == (1, 16)
-    assert isinstance(space["batchSize"], list)
-    assert all(isinstance(s, int) and s % 64 == 0 for s in space["batchSize"])
+    low, high, step = space["batchSize"]
+    assert step == 64
+    assert low % 64 == 0 and high % 64 == 0
+    assert low <= high
 
 
 def test_default_embed_search_space_caps_batches_per_gpu_at_eight():

From 8cd7e8d513c730c97ba8057781e93e8e34d2f8d8 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kevin44boyd@gmail.com>
Date: Tue, 26 May 2026 15:37:15 -0400
Subject: [PATCH 4/5] Update nvmolkit/autotune/_core.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 nvmolkit/autotune/_core.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nvmolkit/autotune/_core.py b/nvmolkit/autotune/_core.py
index 41eb45a8..36eaf9fd 100644
--- a/nvmolkit/autotune/_core.py
+++ b/nvmolkit/autotune/_core.py
@@ -304,6 +304,10 @@ def suggest_from_space(trial, name: str, spec: Any) -> Any:
         return trial.suggest_int(name, int(low), int(high), log=True)
     if isinstance(spec, tuple) and len(spec) == 3 and all(isinstance(v, int) for v in spec):
         low, high, step = spec
+        if step <= 0:
+            raise ValueError(
+                f"Search-space override for {name!r}: step must be a positive integer, got {step!r}."
+            )
         return trial.suggest_int(name, int(low), int(high), step=int(step))
     if isinstance(spec, list) and len(spec) == 2 and all(isinstance(v, int) for v in spec):
         raise TypeError(

From 4b1c4b1267a7e75ae886f74b0ff135132a65e2ae Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Wed, 27 May 2026 08:39:56 -0400
Subject: [PATCH 5/5] formatting

---
 nvmolkit/autotune/_core.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/nvmolkit/autotune/_core.py b/nvmolkit/autotune/_core.py
index 36eaf9fd..fa8a9671 100644
--- a/nvmolkit/autotune/_core.py
+++ b/nvmolkit/autotune/_core.py
@@ -305,9 +305,7 @@ def suggest_from_space(trial, name: str, spec: Any) -> Any:
     if isinstance(spec, tuple) and len(spec) == 3 and all(isinstance(v, int) for v in spec):
         low, high, step = spec
         if step <= 0:
-            raise ValueError(
-                f"Search-space override for {name!r}: step must be a positive integer, got {step!r}."
-            )
+            raise ValueError(f"Search-space override for {name!r}: step must be a positive integer, got {step!r}.")
         return trial.suggest_int(name, int(low), int(high), step=int(step))
     if isinstance(spec, list) and len(spec) == 2 and all(isinstance(v, int) for v in spec):
         raise TypeError(