From a64923fcfabe2a230b5fa4c3bda8ef988d330514 Mon Sep 17 00:00:00 2001 From: sam-obeid Date: Mon, 4 May 2026 11:45:24 -0400 Subject: [PATCH] Public-release hardening: drop neocache, reproducibility & CI fixes Single squashed commit covering everything needed to make a fresh public clone build, lint, test and release-build green from a CI runner with no Shopify git credentials. Drop the neocache benchmark from the public repo - Remove the `Shopify/neocache` git dependency from Cargo.toml and its package stanza + cache-bench dep entry from Cargo.lock. - Delete src/caches/neocache.rs. - Remove neocache from ALL_CACHES and the dispatch arm in src/main.rs, and from the AVAILABLE CACHES / Available caches blocks in --help and --info output. - Replace the [neocache] row in the README comparison table with a 'Note on neocache' explainer documenting why it is absent and how to re-add it on a private overlay branch. Rationale: cargo always fetches git sources referenced by Cargo.lock, even for `optional = true` deps with the feature disabled, so feature-gating cannot fix the auth failure on public CI runners. Verified empirically: `cargo clippy --all-targets --locked -- -D warnings` now passes from a fresh CARGO_HOME with no git creds (the exact public-CI scenario), where it previously failed with 'failed to authenticate when downloading repository'. CI workflow + supply-chain hardening - .github/workflows/ci.yml: pin all third-party actions to 40-char SHAs (with version comment for Dependabot), lock GITHUB_TOKEN to `contents: read`, build the cargo cache key from OS+toolchain+ Cargo.lock with a toolchain-scoped restore-key, fail-fast off, and add a `cargo audit --deny warnings` job with a cached cargo-audit binary. - .cargo/audit.toml: explicitly-accepted advisories with rationale and review dates; new advisories fail the build by default. - .github/CODEOWNERS: ownership for review routing. 
- Cargo.toml: `rust-version = "1.94"` MSRV, `publish = false` (the `cache-bench` name is unclaimed on crates.io and we have no intent to claim it), readme/keywords/categories metadata. Reproducibility plumbing - generate_value_pool now takes the base RNG seed and derives a deterministic StdRng from it (with a 0xC0FFEE offset to avoid colliding with per-task seeds), so re-running with the same --seed produces bit-identical value bytes. - generate_fixed_value takes the RNG by reference instead of using an unseeded thread RNG. - calculate_statistics: f64::total_cmp instead of partial_cmp so a stray NaN sorts deterministically rather than panicking. - BenchConfig::cold_start docs spell out that it is forced `true` during the calibration pass. - Document the fresh-key window wraparound for write_new_key_fraction > 0 in total_benchmark_keys. CLI polish - ANSI colour escapes are now emitted only when stdout is a TTY (cached behind OnceLock); piping benchmark output to a file or CI log no longer produces literal \x1b[...m sequences. Docs - README: refreshed benchmark-output screenshot URL (kept the new user-attachments image from #6 over the in-repo placeholder that referenced a not-yet-committed file). - CONTRIBUTING.md: clarifications. - docs/assets/README.md: contributor guidance scaffolding. 
Verification (local, all green): - cargo fmt --check - cargo clippy --all-targets --locked -- -D warnings (also from a fresh empty CARGO_HOME with no git creds) - cargo build --locked - cargo build --release --locked - cargo test --locked (20 passed) --- .cargo/audit.toml | 29 +++++ .github/CODEOWNERS | 27 +++++ .github/workflows/ci.yml | 48 +++++--- CONTRIBUTING.md | 9 +- Cargo.lock | 13 --- Cargo.toml | 23 +++- README.md | 136 +++++++++++++++------- docs/assets/README.md | 26 +++++ src/caches/minimoka.rs | 8 +- src/caches/mod.rs | 71 ++++++++++-- src/caches/neocache.rs | 157 -------------------------- src/main.rs | 235 +++++++++++++++++++++++++++------------ 12 files changed, 471 insertions(+), 311 deletions(-) create mode 100644 .cargo/audit.toml create mode 100644 .github/CODEOWNERS create mode 100644 docs/assets/README.md delete mode 100644 src/caches/neocache.rs diff --git a/.cargo/audit.toml b/.cargo/audit.toml new file mode 100644 index 0000000..f8ef703 --- /dev/null +++ b/.cargo/audit.toml @@ -0,0 +1,29 @@ +# cargo-audit configuration. +# +# CI runs `cargo audit --deny warnings` so any *new* advisory (vulnerability, +# unsoundness, or unmaintained crate) fails the build. The entries below are +# the explicitly-accepted exceptions: each one must have a tracking note and +# should be revisited on every cargo bump. +# +# When adding an entry: include the advisory ID, a one-line reason, and the +# date it was accepted. When removing: confirm the upstream fix shipped and +# `cargo audit` returns clean without the ignore. + +[advisories] +ignore = [ + # RUSTSEC-2024-0436 — `paste` 1.0.15 is unmaintained. + # Pulled in transitively via foyer 0.22 -> foyer-memory -> paste. No direct + # use; only its proc-macro is invoked at build time. No exposed API surface + # at runtime. Re-evaluate when foyer drops paste or migrates to pastey. + # Accepted: 2026-05. 
+ "RUSTSEC-2024-0436", + + # RUSTSEC-2026-0097 — `rand` is unsound when a custom global logger calls + # `rand::rng()` from inside its log handler. Our dependency tree pulls in + # three affected rand versions + # (0.8.5 via stretto/caches, 0.9.2 via foyer/twox-hash, 0.10.0 direct). + # This benchmark does not install a custom `log` handler and does not call + # `rand::rng()` from any logger path, so the unsoundness condition cannot + # be triggered here. Re-evaluate when transitive deps move to a fixed rand. + # Accepted: 2026-05. + "RUSTSEC-2026-0097", +] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..40d01a3 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,27 @@ +# CODEOWNERS for rust-cache-benchmarks +# +# GitHub auto-assigns the listed teams as reviewers on every PR (including +# Dependabot bumps). Order matters: the *last* matching pattern wins. +# +# All paths below are owned by @Shopify/rust-cache-benchmarks-maintainers. +# If that team is renamed, retired, or split, update every line in this file +# in the same commit — GitHub silently no-ops entries that point at unknown +# teams, which would leave PRs without a required reviewer. + +# Default owner for everything in the repo. +* @Shopify/rust-cache-benchmarks-maintainers + +# Benchmark methodology and statistical reporting are the highest-trust +# surface in this repo. Any change here affects the public comparison and +# should get an extra reviewer who has context on the published numbers. +/src/main.rs @Shopify/rust-cache-benchmarks-maintainers +/src/caches/mod.rs @Shopify/rust-cache-benchmarks-maintainers +/README.md @Shopify/rust-cache-benchmarks-maintainers +/CONTRIBUTING.md @Shopify/rust-cache-benchmarks-maintainers + +# Security-sensitive files: anyone changing the audit allowlist or the CI +# pipeline should get a security-aware review.
+/SECURITY.md @Shopify/rust-cache-benchmarks-maintainers +/.cargo/audit.toml @Shopify/rust-cache-benchmarks-maintainers +/.github/workflows/ @Shopify/rust-cache-benchmarks-maintainers +/.github/CODEOWNERS @Shopify/rust-cache-benchmarks-maintainers diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a58dc50..2098222 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,28 +15,44 @@ env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 +# All third-party actions are pinned to a 40-char commit SHA. The trailing +# comment is the human-readable version Dependabot uses to keep the SHA up to +# date (see .github/dependabot.yml — ecosystem `github-actions`). Never +# downgrade these to floating tags or branches such as `@v4` or `@master`: a +# compromise of the upstream action repo would otherwise execute attacker code +# inside this workflow with the GITHUB_TOKEN scope. jobs: check: - name: Check - runs-on: ubuntu-latest + name: Check (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + # Don't let a flake on one OS hide a real bug on the other. + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@master + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master @ 2026-05 with: toolchain: "1.94" components: rustfmt, clippy - name: Cache cargo registry - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: | ~/.cargo/registry ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-cargo- + # Keys are scoped by OS, toolchain, and Cargo.lock. 
The fallback + # restore-key is also scoped by toolchain so a cache from a + # different Rust version is never partially mounted on top of + # this build (which has historically caused mysterious link + # errors on macOS runners). + key: ${{ runner.os }}-rust-1.94-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-rust-1.94-cargo- - name: Check formatting run: cargo fmt --check @@ -57,18 +73,20 @@ jobs: name: Audit runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@master + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master @ 2026-05 with: toolchain: "1.94" - name: Cache cargo-audit binary - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: ~/.cargo/bin/cargo-audit - key: ${{ runner.os }}-cargo-audit-bin-v1 + # Cache key includes toolchain so audit binaries built against + # an older Rust never silently land in a newer pipeline. + key: ${{ runner.os }}-rust-1.94-cargo-audit-bin-v1 - name: Install cargo-audit run: | @@ -77,6 +95,8 @@ jobs: fi - name: Audit dependencies - # cargo audit fails on vulnerabilities by default. Warnings (unmaintained - # crates, soundness advisories) are reported but do not fail the job. - run: cargo audit + # `--deny warnings` makes any *new* unmaintained or unsoundness + # advisory fail the build. Currently-accepted advisories are listed + # explicitly in .cargo/audit.toml with rationale and review dates; + # adding a new ignore there is a deliberate, reviewable code change. + run: cargo audit --deny warnings diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e306876..ebe9170 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,8 @@ Thanks for your interest in contributing! `rust-cache-benchmarks` is a benchmark ## Local checks -CI runs against Rust `1.94`. 
All of these must pass: +CI runs against Rust `1.94` (the project's MSRV — see `rust-toolchain.toml` +and `Cargo.toml`'s `rust-version`). All of these must pass: ```sh cargo fmt --check @@ -41,6 +42,12 @@ cargo clippy --all-targets --locked -- -D warnings cargo build --locked cargo build --release --locked cargo test --locked + +# Supply-chain audit. CI runs this with `--deny warnings`, so any *new* +# unmaintained or unsoundness advisory will fail the pipeline. Currently +# accepted advisories are listed in .cargo/audit.toml with rationale. +# `cargo install --locked cargo-audit` if you don't have it locally. +cargo audit --deny warnings ``` ## Running benchmarks diff --git a/Cargo.lock b/Cargo.lock index acc0243..331f46b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -151,7 +151,6 @@ dependencies = [ "lru-mem", "mini-moka", "moka", - "neocache", "parking_lot", "quick_cache", "rand 0.10.0", @@ -1070,18 +1069,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "neocache" -version = "0.1.0" -source = "git+https://github.com/Shopify/neocache?rev=3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c#3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c" -dependencies = [ - "ahash", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "parking_lot_core", -] - [[package]] name = "num-traits" version = "0.2.19" diff --git a/Cargo.toml b/Cargo.toml index 96e537c..3b3b0b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,11 +2,22 @@ name = "cache-bench" version = "0.1.0" edition = "2021" +# MSRV. Kept in sync with `rust-toolchain.toml` and the toolchain pinned in +# `.github/workflows/ci.yml`. Bumping any of the three requires bumping the +# other two. 
+rust-version = "1.94" description = "Concurrent in-memory cache benchmarks for Rust" license = "MIT" +readme = "README.md" repository = "https://github.com/Shopify/rust-cache-benchmarks" homepage = "https://github.com/Shopify/rust-cache-benchmarks" authors = ["Shopify Inc."] +keywords = ["cache", "benchmark", "lru", "concurrent", "performance"] +categories = ["caching", "development-tools::profiling"] +# This is a benchmark harness, not a library. Block accidental `cargo publish` +# from a maintainer's machine or a future release CI — the `cache-bench` name +# is currently free on crates.io and we do not intend to claim it. +publish = false [dependencies] schnellru = "0.2.4" @@ -27,13 +38,17 @@ clru = "0.6.3" lru-mem = "0.3.0" sieve-cache = "1.1.6" caches = "0.3.0" -# Pinned to a commit SHA so `cargo update` cannot silently roll the -# benchmark forward to a future neocache release with different -# performance characteristics. Bump deliberately when re-baselining. -neocache = { git = "https://github.com/Shopify/neocache", rev = "3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c" } rand_distr = "0.6" parking_lot = "0.12" +# `neocache` is intentionally not a dependency of this public repository. +# The upstream crate lives in a private Shopify repo, so any reference to +# it (even an `optional = true` git dependency) is recorded in `Cargo.lock` +# and forces `cargo` to fetch the source on every build — which fails with +# `failed to authenticate when downloading repository` on any clone +# without Shopify git credentials, including public CI runners. See the +# `Per-cache configuration deviations` section of `README.md`. + [profile.release] opt-level = 3 lto = "fat" diff --git a/README.md b/README.md index a926731..533e2e4 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ Concurrent in-memory cache benchmarks for Rust. 
Compares throughput, hit rate, a image - ### Bachmark machine specs: | Component | Details | @@ -25,26 +24,80 @@ Concurrent in-memory cache benchmarks for Rust. Compares throughput, hit rate, a ## Caches compared -| Crate | Strategy | -|---|---| -| [moka](https://crates.io/crates/moka) | TinyLFU / W-TinyLFU | -| [mini-moka](https://crates.io/crates/mini-moka) | TinyLFU (lighter weight) | -| [quick_cache](https://crates.io/crates/quick_cache) | LRU / CLOCK-Pro | -| [schnellru](https://crates.io/crates/schnellru) | LRU | -| [stretto](https://crates.io/crates/stretto) | Ristretto (TinyLFU) | -| [lru](https://crates.io/crates/lru) | LRU | -| [TinyUFO](https://crates.io/crates/TinyUFO) | TinyUFO | -| [foyer](https://crates.io/crates/foyer) | S3-FIFO | -| [cached](https://crates.io/crates/cached) | LRU (proc-macro) | -| [clru](https://crates.io/crates/clru) | Count-Min LRU | -| [lru-mem](https://crates.io/crates/lru-mem) | Memory-bounded LRU | -| [sieve-cache](https://crates.io/crates/sieve-cache) | SIEVE | -| [caches](https://crates.io/crates/caches) | Two-Queue | -| [neocache](https://github.com/Shopify/neocache) | S3-FIFO + DashMap | +The `Concurrency` column is a critical fairness disclosure: throughput numbers +are dominated by lock topology as much as by eviction policy. Caches marked +*sharded* serialise only within their shard; caches marked *single mutex* are +wrapped by this benchmark in a `parking_lot::Mutex` because the underlying +crate is not internally `Sync`-safe for concurrent writers. Compare +like-for-like before drawing conclusions. 
+ +| Crate | Strategy | Concurrency | +|---|---|---| +| [moka](https://crates.io/crates/moka) | TinyLFU / W-TinyLFU | sharded (built-in) | +| [mini-moka](https://crates.io/crates/mini-moka) | TinyLFU (lighter weight) | sharded (built-in) | +| [quick_cache](https://crates.io/crates/quick_cache) | LRU / CLOCK-Pro | sharded (built-in) | +| [schnellru](https://crates.io/crates/schnellru) | LRU | single `Mutex` (this harness) | +| [stretto](https://crates.io/crates/stretto) | Ristretto (TinyLFU) | sharded (built-in) - see footnote † | +| [lru](https://crates.io/crates/lru) | LRU | single `Mutex` (this harness) | +| [TinyUFO](https://crates.io/crates/TinyUFO) | TinyUFO | sharded (built-in) | +| [foyer](https://crates.io/crates/foyer) | S3-FIFO | sharded (built-in) | +| [cached](https://crates.io/crates/cached) | LRU (proc-macro) | single `Mutex` (this harness) | +| [clru](https://crates.io/crates/clru) | Count-Min LRU | single `Mutex` (this harness) | +| [lru-mem](https://crates.io/crates/lru-mem) | Memory-bounded LRU | single `Mutex` (this harness) - see footnote ‡ | +| [sieve-cache](https://crates.io/crates/sieve-cache) | SIEVE | sharded (`ShardedSieveCache`) | +| [caches](https://crates.io/crates/caches) | Two-Queue | single `Mutex` (this harness) — see footnote § | + +> **Note on `neocache`.** Earlier internal revisions of this harness also +> benchmarked Shopify's in-house `neocache` crate (S3-FIFO + DashMap, built-in +> sharding). It is intentionally absent from the public release: `neocache` +> lives in a private repository, and even an `optional = true` git +> dependency forces `cargo` to record the source in `Cargo.lock` and fetch +> it on every build, which fails for any clone without Shopify git +> credentials (including public CI runners). Reintroducing it requires a +> private overlay branch that adds the dependency, the +> `src/caches/neocache.rs` module, and matching entries in `ALL_CACHES` and +> `dispatch` in `src/main.rs`. 
+ +### Per-cache configuration deviations + +Every cache is constructed with default parameters *unless* listed here. Anything +on this list is a deliberate choice we have made on behalf of the cache; the +rationale is in the matching `src/caches/<name>.rs` file. If you believe one of +these tunings biases the comparison, please file a methodology issue with a +before/after diff. + +- **† stretto** - constructed with `num_counters = 10× cache_size`, + `set_ignore_internal_cost(true)`, and `set_buffer_size(num_tasks × 4 KiB)`. + Stretto additionally receives a unique 3-phase warmup (insert → read pass to + populate TinyLFU frequency counts → re-insert with TinyLFU-informed admission) + because its frequency sketch is updated only from reads, never from inserts; + a single-phase warmup leaves admission decisions effectively random and + collapses the hit rate to ~45%. +- **‡ lru-mem** - capacity is byte-budgeted (`cache_size × value_size`). The + budget covers value bytes only; `String` headers and key-string storage are + not accounted for, so the effective entry count slightly exceeds + `cache_size`. Documented for transparency; correcting it would require + bespoke `HeapSize` accounting that other caches do not model. +- **§ caches (Two-Queue)** - the [`caches`](https://crates.io/crates/caches) + crate has not seen a release since 2022 and pulls in `rand 0.8.5`. Included + for historical comparison; a future PR may retire it once a maintained + alternative exists. +- **mini-moka** - unlike moka, mini-moka 0.10 does not expose a public flush + analogous to `run_pending_tasks()` (the equivalent `sync()` is private), so + warmup completion is not synchronous on this cache. The first measurement + iteration absorbs the residual eviction work; later iterations measure + steady state. +- **lrucache, schnellru, clru, cached, two-queue, lru-mem** - wrapped in a + single `parking_lot::Mutex` because none of them is `Sync` for concurrent + writers.
This makes them strictly worse than sharded caches under the + default 8-thread workload; the comparison is still meaningful as a + *deployment* signal ("do not use these from multiple writer threads"), not + as a pure algorithmic ranking. ## Requirements -- Rust toolchain (pinned in `rust-toolchain.toml` — installed automatically via `rustup`) +- Rust toolchain (MSRV 1.94, pinned in `rust-toolchain.toml`, `Cargo.toml`'s + `rust-version`, and the CI workflow). Installed automatically via `rustup`. - For best results, run on a quiet machine (no other heavy processes) ## Build and run @@ -54,7 +107,7 @@ Concurrent in-memory cache benchmarks for Rust. Compares throughput, hit rate, a cargo run --release # Run a subset -cargo run --release -- --caches moka,quick_cache,neocache +cargo run --release -- --caches moka,quick_cache,sieve # Override config parameters at runtime (no recompile needed) cargo run --release -- --size 10000 --zipf 0.9 @@ -76,19 +129,19 @@ cargo run --release -- --info | Column | Meaning | |---|---| -| `ops/sec` | Median throughput across all measurement iterations — total operations (reads + writes) per wall-clock second | +| `ops/sec` | Median throughput across all measurement iterations - total operations (reads + writes) per wall-clock second | | `±ci95` | 95% CI half-width as a percentage of the mean throughput (uses the actual `n` for this cache). Two caches whose CI ranges don't overlap are distinguishable at the 95% level | | `cv%` | Coefficient of variation (`stddev / mean × 100`). < 3% = stable; > 10% = noisy, flagged with `!` | | `n` | Actual number of measurement iterations run. 
Adaptive early stopping may converge before the 30-iteration maximum | -| `p50µs` | Median of per-iteration p50 latencies in µs — the typical latency of a representative single run | -| `p99µs` | Median of per-iteration p99 latencies in µs — avoids the pooling bias of aggregating all iterations before computing percentiles | -| `tailµs` | Tail amplification = `p99µs − p50µs` (both medians) in µs. Lower = more consistent per-op cost | -| `p99cv%` | CV% of per-iteration p99 latencies — measures tail latency *stability*; high = occasional spikes invisible in the median | -| `wp99µs` | Median of per-iteration **write-only** p99 latencies in µs. Isolates write cost from the read-dominated `p99µs`; especially diagnostic for mutex-backed caches where writes hold an exclusive lock. Shows `--` if no write samples exist | -| `hit%` | Cache hit rate — fraction of reads that found the key; purely a function of eviction policy and access pattern | +| `p50μs` | Median of per-iteration p50 latencies in μs - the typical latency of a representative single run | +| `p99μs` | Median of per-iteration p99 latencies in μs - avoids the pooling bias of aggregating all iterations before computing percentiles | +| `tailμs` | Tail amplification = `p99μs - p50μs` (both medians) in μs. Lower = more consistent per-op cost | +| `p99cv%` | CV% of per-iteration p99 latencies - measures tail latency *stability*; high = occasional spikes invisible in the median | +| `wp99μs` | Median of per-iteration **write-only** p99 latencies in μs. Isolates write cost from the read-dominated `p99μs`; especially diagnostic for mutex-backed caches where writes hold an exclusive lock. Shows `--` if no write samples exist | +| `hit%` | Cache hit rate - fraction of reads that found the key; purely a function of eviction policy and access pattern | | `±hitci` | 95% CI half-width for hit rate in absolute percentage points (e.g. `±0.020` = ±0.020 pp). 
Three decimal places avoids the misleading `±0.00` display when CI is very tight but nonzero | -> **Note on latency floor:** Each sampled latency is bracketed by two TSC reads (~10–20 ns each on Linux via vDSO). The combined ~20–40 ns overhead inflates `p50µs`/`p99µs` for the fastest caches. Use latency numbers for relative comparison, not absolute wall-clock cost. +> **Note on latency floor:** Each sampled latency is bracketed by two TSC reads (~10-20 ns each on Linux via vDSO). The combined ~20-40 ns overhead inflates `p50μs`/`p99μs` for the fastest caches. Use latency numbers for relative comparison, not absolute wall-clock cost. ## Configuration @@ -103,7 +156,7 @@ All parameters have sensible defaults and can be overridden via CLI flags withou | `value_size` | `--value-size` | 5,120 B | 5 KB cached values | | `num_iterations` | `--iters` | 30 (max) | Hard cap; adaptive early stopping may converge sooner | | `warmup_iterations` | `--warmup` | 1 | One ~3 s pass primes branch predictors and engages Turbo Boost | -| `zipf_exponent` | `--zipf` | 1.07 | ≈ 80/20 hotspot skew; sweep 0.8–1.5 | +| `zipf_exponent` | `--zipf` | 1.07 | ≈ 80/20 hotspot skew; sweep 0.8-1.5 | | `access_pattern` | `--pattern` | zipfian | `zipfian`, `uniform`, or `sequential` | | `cold_start` | `--cold-start` | false | Start with empty cache | | `latency_sample_every` | `--sample-every` | 10 | Bernoulli sampling rate (~10% of ops) | @@ -113,17 +166,22 @@ All parameters have sensible defaults and can be overridden via CLI flags withou ## Benchmark methodology -- **Task-parallel, OS-thread model** — each task runs in a `tokio::task::spawn_blocking` thread for true parallelism -- **Barrier-synchronized start** — all tasks start simultaneously; throughput = `total_ops / wall_time_of_slowest_task` -- **Per-cache ops calibration** — a 200 K-op/task calibration pass before warmup measures each cache's throughput and sets `num_keys_per_task = clamp(3 s × throughput / num_tasks, 500 K, 2 M)`, targeting 
~3 s wall time per iteration; prevents slow global-mutex caches from dominating total runtime -- **Per-iteration randomised ordering** — the remaining active caches are shuffled each iteration to eliminate thermal/position bias -- **Adaptive early stopping** — after each iteration, any cache with ≥ 15 samples and 95% CI < 0.75% of mean is retired; the `n` column shows actual iterations run -- **Median throughput** — robust to single-iteration outliers from thermal throttling or OS scheduling -- **95% confidence intervals** — reported on both throughput (`±ci95`) and hit rate (`±hitci`) -- **Per-iteration latency percentiles** — p50/p99 computed per iteration then summarised with the median; avoids the pooling bias of aggregating all samples before computing percentiles -- **Bernoulli latency sampling** — each op is sampled with probability 1/N from the task's existing RNG; avoids phase-locking with periodic cache slow paths that deterministic modulo sampling can miss -- **Noisy-run flagging** — throughput CV% > 10 is marked with `!` in the output table; a footnote is printed below the table when triggered +- **Task-parallel, OS-thread model** - each task runs in a `tokio::task::spawn_blocking` thread for true parallelism +- **Barrier-synchronized start** - all tasks start simultaneously; throughput = `total_ops / wall_time_of_slowest_task` +- **Per-cache ops calibration** - a 200 K-op/task calibration pass before warmup measures each cache's throughput and sets `num_keys_per_task = clamp(3 s × throughput / num_tasks, 500 K, 2 M)`, targeting ~3 s wall time per iteration; prevents slow global-mutex caches from dominating total runtime +- **Per-iteration randomised ordering** - the remaining active caches are shuffled each iteration to eliminate thermal/position bias +- **Adaptive early stopping** - after each iteration, any cache with ≥ 15 samples and 95% CI < 0.75% of mean is retired; the `n` column shows actual iterations run +- **Median throughput** - robust to 
single-iteration outliers from thermal throttling or OS scheduling +- **95% confidence intervals** - reported on both throughput (`±ci95`) and hit rate (`±hitci`) +- **Per-iteration latency percentiles** - p50/p99 computed per iteration then summarised with the median; avoids the pooling bias of aggregating all samples before computing percentiles +- **Bernoulli latency sampling** - each op is sampled with probability 1/N from the task's existing RNG; avoids phase-locking with periodic cache slow paths that deterministic modulo sampling can miss +- **Noisy-run flagging** - throughput CV% > 10 is marked with `!` in the output table; a footnote is printed below the table when triggered + +## Reporting security issues + +Please do not open public issues for security reports. See [SECURITY.md](SECURITY.md) +for private disclosure channels. ## License -MIT +Licensed under the [MIT License](LICENSE). diff --git a/docs/assets/README.md b/docs/assets/README.md new file mode 100644 index 0000000..1c993bd --- /dev/null +++ b/docs/assets/README.md @@ -0,0 +1,26 @@ +# Assets + +This directory holds README assets that need to be versioned with the repo +(screenshots, charts, banners) so they cannot break independently of the code. + +## Currently expected files + +| Path | Purpose | +|---|---| +| `benchmark-output.png` | Hero screenshot of `cargo run --release` output, referenced from the top of `README.md`. | + +## How to refresh the screenshot + +1. On a quiet machine matching the spec table in `README.md`, run: + ```sh + cargo run --release > /tmp/bench.txt + ``` +2. Take a screenshot of the rendered table (the binary uses ANSI escapes for + colour, so a terminal screenshot is more readable than the raw text). +3. Save as `docs/assets/benchmark-output.png`. Keep it under ~500 KB; resize + to a reasonable display width (1300–1600 px is fine). +4. Commit. The README link is a relative path, so no further changes needed.
+ +Do **not** re-link the README to `github.com/user-attachments/...` URLs: +those are tied to a single uploader and disappear if the user is removed +or the upload is purged. diff --git a/src/caches/minimoka.rs b/src/caches/minimoka.rs index 41774f8..f48a955 100644 --- a/src/caches/minimoka.rs +++ b/src/caches/minimoka.rs @@ -20,8 +20,12 @@ pub async fn run_bench( let value = value_pool[key_idx % value_pool.len()].clone(); cache.insert(key_pool[key_idx].clone(), value); } - // Flush deferred eviction decisions before measurement, matching moka's - // run_pending_tasks() call and ensuring a consistent steady-state start. + // mini-moka 0.10 does NOT expose a public flush analogous to + // moka::sync::Cache::run_pending_tasks(); the equivalent `sync()` is + // private. Pending eviction work may therefore still be in-flight when + // measurement begins. The first warmup iteration absorbs most of it, + // but mini-moka's read of `entry_count` shortly after warmup may lag + // behind moka's. Documented here so the asymmetry is not silent. } let cache = Arc::new(cache); diff --git a/src/caches/mod.rs b/src/caches/mod.rs index 10535e0..b9848be 100644 --- a/src/caches/mod.rs +++ b/src/caches/mod.rs @@ -11,7 +11,12 @@ pub mod lrucache; pub mod lrumem; pub mod minimoka; pub mod moka; -pub mod neocache; +// `neocache` is intentionally not benchmarked from this public repo: +// the upstream crate lives in a private Shopify repo and so cannot be +// cloned by external users or CI runners. See the comment in +// `Cargo.toml` and the README. To benchmark `neocache` internally, +// keep the dependency, this `pub mod`, and the corresponding entry in +// `ALL_CACHES`/`dispatch` in a private overlay branch. 
pub mod quick_cache; pub mod schnellru; pub mod sieve; @@ -54,7 +59,14 @@ pub struct BenchConfig { pub access_pattern: AccessPattern, /// Zipf skew exponent: 1.07 ≈ 80/20 hotspot; higher = more skewed pub zipf_exponent: f64, - /// Skip pre-population warmup to measure cold ramp-up; false = steady-state + /// Skip pre-population warmup to measure cold ramp-up; false = steady-state. + /// + /// Also set internally to `true` for the calibration pass so that + /// calibration measures pure hot-loop throughput, not + /// `(pre-pop cost + hot-loop) / wall_time` — caches with expensive + /// pre-pop (e.g. stretto's 3-phase warmup, lru-mem's per-byte tracking) + /// would otherwise be under-calibrated and assigned a smaller + /// `num_keys_per_task` than is actually warranted. pub cold_start: bool, /// Minimum key string length (zero-padded); 0 = natural numeric length pub key_size: usize, @@ -79,9 +91,13 @@ pub struct BenchResults { pub write_latencies_ns: Vec, // Sampled write-only latencies in nanoseconds } -pub fn generate_fixed_value(size: usize) -> String { - rand::rng() - .sample_iter(&rand::distr::Alphanumeric) +/// Generate a single random alphanumeric string of `size` bytes from the +/// caller-supplied RNG. Taking the RNG by reference lets the value pool be +/// generated deterministically from the benchmark's base seed instead of from +/// an unseeded thread RNG — a precondition of the project's reproducibility +/// guarantee documented in `--info`. +pub fn generate_fixed_value(rng: &mut impl rand::Rng, size: usize) -> String { + rng.sample_iter(&rand::distr::Alphanumeric) .take(size) .map(char::from) .collect() @@ -89,9 +105,24 @@ pub fn generate_fixed_value(size: usize) -> String { /// Pre-allocates a pool of values to eliminate allocation overhead during benchmarking. /// Uses Arc to avoid expensive clones — only the Arc is cloned (8 bytes), not the data. 
-pub fn generate_value_pool(pool_size: usize, value_size: usize) -> Arc>> { +/// +/// `seed` is taken from `BenchConfig::rng_seed`, with a fixed offset so it +/// cannot collide with the per-task seeds derived as `(rng_seed + task_id + 1)`. +/// Re-running with the same seed therefore produces bit-identical value bytes, +/// which is what `--info` claims and what before/after diff comparisons rely on. +pub fn generate_value_pool( + pool_size: usize, + value_size: usize, + seed: u64, +) -> Arc>> { + // Offset chosen to leave the [seed, seed + num_tasks] range free for the + // per-task RNGs in every cache implementation. 0xC0FFEE is arbitrary + // — the only requirement is that it is large enough to not collide with + // realistic task counts. + const VALUE_POOL_SEED_OFFSET: u64 = 0x00C0_FFEE; + let mut rng = rand::rngs::StdRng::seed_from_u64(seed.wrapping_add(VALUE_POOL_SEED_OFFSET)); let pool: Vec> = (0..pool_size) - .map(|_| Arc::new(generate_fixed_value(value_size))) + .map(|_| Arc::new(generate_fixed_value(&mut rng, value_size))) .collect(); Arc::new(pool) } @@ -120,7 +151,27 @@ pub fn generate_key_pool(num_keys: usize, key_size: usize) -> Arc> { } /// Returns the number of primary benchmark keys (the Zipf keyspace). -/// The full key pool is 2× this value to accommodate write_new_key_fraction. +/// The full key pool is 2× this value to accommodate `write_new_key_fraction`. +/// +/// ## Fresh-key window (write_new_key_fraction > 0) +/// +/// Each cache implementation computes a fresh-key index as: +/// +/// ```text +/// idx = total_keys + ((seq_start + fresh_write_count) % total_keys) +/// ``` +/// +/// so `idx` lives in `[total_keys, 2 × total_keys)`. Because the modulus is +/// `total_keys` (not `total_keys * num_tasks`), two tasks with identical +/// `seq_start mod total_keys` will rotate through the **same** fresh-key +/// indices. With the default 8 tasks × 2 M ops over a 480 K Zipf keyspace, +/// a full wrap needs 480 K fresh writes per task — more than a single run issues.
+/// +/// This is intentional: the goal of `write_new_key_fraction` is to model +/// insert-heavy workloads (event ingestion, telemetry firehoses) by injecting +/// keys outside the Zipf hot-set, not to allocate a fully unique fresh-key +/// stream per task. Documenting it here so the bound on the fresh-key pool +/// is auditable and the wraparound is not surprising. pub fn total_benchmark_keys(cfg: &BenchConfig) -> usize { cfg.num_distinct_keys } @@ -223,7 +274,9 @@ pub fn calculate_statistics(values: &[f64]) -> (f64, f64, f64, f64) { let mean = values.iter().sum::() / values.len() as f64; let mut sorted = values.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + // total_cmp instead of partial_cmp so a stray NaN sorts deterministically + // to one end rather than panicking the whole benchmark summary. + sorted.sort_by(|a, b| a.total_cmp(b)); let n = sorted.len(); let median = if n.is_multiple_of(2) { (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0 diff --git a/src/caches/neocache.rs b/src/caches/neocache.rs deleted file mode 100644 index f4c0025..0000000 --- a/src/caches/neocache.rs +++ /dev/null @@ -1,157 +0,0 @@ -use crate::caches::{total_benchmark_keys, BenchConfig, BenchResults, KeyGenerator}; -use neocache::NeoCache; -use rand::{RngExt, SeedableRng}; -use std::sync::Arc; -use std::time::Instant; - -/// NeoCache: concurrent HashMap with built-in S3-FIFO cache eviction. -/// Unlike the DashMap benchmark, no manual entry_count tracking or random -/// eviction is required — S3-FIFO eviction runs automatically per-shard on -/// every insert once the cache reaches capacity. 
-pub async fn run_bench( - cfg: Arc, - value_pool: Arc>>, - key_pool: Arc>, -) -> BenchResults { - let cache: Arc>> = Arc::new(NeoCache::new(cfg.cache_size)); - let total_keys = total_benchmark_keys(&cfg); - - if !cfg.cold_start { - let warmup_key_gen = KeyGenerator::new(cfg.access_pattern, total_keys, cfg.zipf_exponent); - let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.rng_seed); - for i in 0..total_keys { - let key_idx = warmup_key_gen.next_key(i, &mut rng); - let value = value_pool[key_idx % value_pool.len()].clone(); - cache.insert(key_pool[key_idx].clone(), value); - } - // No eviction trim needed: S3-FIFO enforces cache_capacity automatically. - } - - let key_gen = Arc::new(KeyGenerator::new( - cfg.access_pattern, - total_keys, - cfg.zipf_exponent, - )); - let barrier = Arc::new(std::sync::Barrier::new(cfg.num_tasks)); - - let tasks: Vec<_> = (0..cfg.num_tasks) - .map(|i| { - let my_cache = cache.clone(); - let value_pool = value_pool.clone(); - let key_pool = key_pool.clone(); - let key_gen = key_gen.clone(); - let seq_start = i * cfg.num_keys_per_task; - let seq_end = (i + 1) * cfg.num_keys_per_task; - let cfg = cfg.clone(); - let barrier = barrier.clone(); - - tokio::task::spawn_blocking(move || { - let mut local_latencies = - Vec::with_capacity(cfg.num_keys_per_task / cfg.latency_sample_every + 64); - let mut local_write_latencies = - Vec::with_capacity(cfg.num_keys_per_task / cfg.latency_sample_every / 5 + 64); - let mut local_reads = 0usize; - let mut local_hits = 0usize; - let mut fresh_write_count = 0usize; - let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.rng_seed + i as u64 + 1); - - barrier.wait(); - let task_start = Instant::now(); - - for seq_key in seq_start..seq_end { - let key_idx = key_gen.next_key(seq_key, &mut rng); - let k = &key_pool[key_idx]; - - let is_read = rng.random_bool(cfg.read_write_ratio); - let should_measure = rng.random_bool(1.0 / cfg.latency_sample_every as f64); - - if is_read { - local_reads += 1; - let op_start = 
if should_measure { - Some(Instant::now()) - } else { - None - }; - let hit = my_cache.get(k); - if let Some(t) = op_start { - local_latencies.push(t.elapsed().as_nanos() as u64); - } - if let Some(value) = hit { - local_hits += 1; - let checksum: u32 = value - .as_bytes() - .iter() - .step_by(256) - .map(|&b| b as u32) - .sum(); - std::hint::black_box(checksum); - } - } else { - let write_key_idx = if cfg.write_new_key_fraction > 0.0 - && rng.random_bool(cfg.write_new_key_fraction) - { - let idx = total_keys + ((seq_start + fresh_write_count) % total_keys); - fresh_write_count += 1; - idx - } else { - key_idx - }; - let k_write = &key_pool[write_key_idx]; - let value = value_pool[write_key_idx % value_pool.len()].clone(); - let op_start = if should_measure { - Some(Instant::now()) - } else { - None - }; - my_cache.insert(k_write.clone(), value); - if let Some(t) = op_start { - let ns = t.elapsed().as_nanos() as u64; - local_latencies.push(ns); - local_write_latencies.push(ns); - } - } - } - - ( - local_latencies, - local_write_latencies, - local_reads, - local_hits, - task_start.elapsed(), - ) - }) - }) - .collect(); - - let mut all_latencies = Vec::new(); - let mut all_write_latencies = Vec::new(); - let mut total_reads = 0usize; - let mut total_hits = 0usize; - let mut max_duration = std::time::Duration::ZERO; - for handle in tasks { - let (lat, wlat, reads, hits, duration) = handle.await.unwrap(); - all_latencies.extend(lat); - all_write_latencies.extend(wlat); - total_reads += reads; - total_hits += hits; - max_duration = max_duration.max(duration); - } - - let total_ops = cfg.num_tasks * cfg.num_keys_per_task; - let throughput = total_ops as f64 / max_duration.as_secs_f64(); - let hit_rate = if total_reads > 0 { - total_hits as f64 / total_reads as f64 - } else { - 0.0 - }; - - BenchResults { - throughput, - hit_rate, - total_ops, - total_reads, - total_hits, - latencies_ns: all_latencies, - write_latencies_ns: all_write_latencies, - } -} diff --git 
a/src/main.rs b/src/main.rs index fc10219..2458a08 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,16 +1,61 @@ use rand::{seq::SliceRandom, SeedableRng}; use std::collections::HashMap; +use std::io::IsTerminal; use std::sync::Arc; +use std::sync::OnceLock; use crate::caches::*; -// Raw ANSI sequences — no crate dependency. -const DIM: &str = "\x1b[2m"; -const RESET: &str = "\x1b[0m"; -const BOLD: &str = "\x1b[1m"; -const GREEN: &str = "\x1b[32m"; -const YELLOW: &str = "\x1b[33m"; -const CYAN: &str = "\x1b[36m"; +// ANSI colour escapes are written through these helpers so they collapse to +// empty strings when stdout is not a TTY (piped to a file, captured by CI). +// Writing the escape bytes to a non-terminal sink produces literal `\x1b[2m` +// in the output, which would be unhelpful for anyone diffing benchmark logs. +fn ansi_enabled() -> bool { + static CACHED: OnceLock = OnceLock::new(); + *CACHED.get_or_init(|| std::io::stdout().is_terminal()) +} +fn dim() -> &'static str { + if ansi_enabled() { + "\x1b[2m" + } else { + "" + } +} +fn reset() -> &'static str { + if ansi_enabled() { + "\x1b[0m" + } else { + "" + } +} +fn bold() -> &'static str { + if ansi_enabled() { + "\x1b[1m" + } else { + "" + } +} +fn green() -> &'static str { + if ansi_enabled() { + "\x1b[32m" + } else { + "" + } +} +fn yellow() -> &'static str { + if ansi_enabled() { + "\x1b[33m" + } else { + "" + } +} +fn cyan() -> &'static str { + if ansi_enabled() { + "\x1b[36m" + } else { + "" + } +} mod caches; @@ -31,7 +76,10 @@ const ALL_CACHES: &[&str] = &[ "lrumem", "sieve", "two_queue", - "neocache", + // `neocache` is deliberately not in this public list — see the + // comment in `caches/mod.rs` and `Cargo.toml` for why. Internal + // users carrying the overlay should add the entry back here in + // the same alphabetical-by-introduction order. ]; /// Dispatch a single benchmark run by cache name. 
@@ -55,7 +103,6 @@ async fn dispatch( "lrumem" => lrumem::run_bench(cfg, value_pool, key_pool).await, "sieve" => sieve::run_bench(cfg, value_pool, key_pool).await, "two_queue" => two_queue::run_bench(cfg, value_pool, key_pool).await, - "neocache" => neocache::run_bench(cfg, value_pool, key_pool).await, // Cache names are validated against `ALL_CACHES` in `parse_args`, so any // unknown name here indicates a programmer error (a name was added to // `ALL_CACHES` without a matching arm here). @@ -73,8 +120,11 @@ async fn run_benchmark_suite( let spinners = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']; let mut spin_idx: usize = 0; - // Pre-allocate value pool once for all benchmarks - let value_pool = generate_value_pool(cfg.value_pool_size, cfg.value_size); + // Pre-allocate value pool once for all benchmarks. Seeded from the same + // base seed as the rest of the benchmark, so re-running with the same + // `--seed` produces bit-identical value bytes (matching the + // reproducibility guarantee printed by `--info`). + let value_pool = generate_value_pool(cfg.value_pool_size, cfg.value_size, cfg.rng_seed); let key_pool = generate_key_pool(total_keys * 2, cfg.key_size); let active: Vec<&'static str> = ALL_CACHES @@ -97,8 +147,15 @@ async fn run_benchmark_suite( const MIN_OPS: usize = 500_000; const MAX_OPS: usize = 2_000_000; + // Calibration runs with `cold_start = true` so we measure pure hot-loop + // throughput, not (pre-pop cost + hot-loop) / wall_time. Caches with + // expensive pre-population (stretto's 3-phase warmup, lru-mem's per-byte + // tracking) would otherwise be under-calibrated relative to caches with + // cheap pre-pop, giving them a smaller `num_keys_per_task` budget than + // their steady-state throughput justifies. 
let cal_cfg = Arc::new(BenchConfig { num_keys_per_task: CAL_OPS, + cold_start: true, ..(*cfg).clone() }); let mut per_cache_cfg: HashMap<&'static str, Arc> = HashMap::new(); @@ -114,7 +171,9 @@ async fn run_benchmark_suite( sp, cal_idx + 1, total_active, - name + name, + DIM = dim(), + RESET = reset(), ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); @@ -145,7 +204,9 @@ async fn run_benchmark_suite( sp, i + 1, cfg.warmup_iterations, - name + name, + YELLOW = yellow(), + RESET = reset(), ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); @@ -183,7 +244,15 @@ async fn run_benchmark_suite( spin_idx += 1; print!( "{:<65}\r", - format!(" {} {CYAN}{BOLD}run{RESET} {} {}...", sp, n, name) + format!( + " {} {CYAN}{BOLD}run{RESET} {} {}...", + sp, + n, + name, + CYAN = cyan(), + BOLD = bold(), + RESET = reset(), + ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); let result = dispatch( @@ -216,38 +285,61 @@ fn check_help_flag() -> bool { std::env::args().any(|a| a == "--help" || a == "-h") } +/// Single source of truth for the user-facing flag table. +/// +/// Both `print_help` (`--help`) and `print_info` (`--info`) render from this +/// list so they cannot drift apart. When adding or renaming a flag, update +/// **only** this table plus the matching arm in `parse_args`. +struct FlagDoc { + /// Human-readable invocation form, e.g. `"--size N"` or `"--caches / -c LIST"`. + form: &'static str, + /// One-line description suitable for both the brief and full references. + /// Includes the default value where applicable. + desc: &'static str, +} + +#[rustfmt::skip] +const FLAG_TABLE: &[FlagDoc] = &[ + FlagDoc { form: "--size N", desc: "cache_size (default: 30000)" }, + FlagDoc { form: "--keys N", desc: "num_distinct_keys (default: 480000, i.e. 
16× cache_size)" }, + FlagDoc { form: "--tasks N", desc: "num_tasks (default: available_parallelism)" }, + FlagDoc { form: "--iters N", desc: "num_iterations max (default: 30; adaptive stopping may run fewer)" }, + FlagDoc { form: "--warmup N", desc: "warmup_iterations (default: 1)" }, + FlagDoc { form: "--ratio F", desc: "read_write_ratio 0.0–1.0 (default: 0.80)" }, + FlagDoc { form: "--value-size N", desc: "value size in bytes (default: 5120)" }, + FlagDoc { form: "--zipf F", desc: "Zipf exponent (default: 1.07) sweep 0.8–1.5" }, + FlagDoc { form: "--pattern P", desc: "access pattern: zipfian|uniform|sequential (default: zipfian)" }, + FlagDoc { form: "--cold-start", desc: "start with empty cache instead of pre-populated" }, + FlagDoc { form: "--key-size N", desc: "minimum key string width, zero-padded (default: 0)" }, + FlagDoc { form: "--write-new F", desc: "fraction of writes inserting a fresh key (default: 0.0)" }, + FlagDoc { form: "--seed N", desc: "RNG seed (default: 42)" }, + FlagDoc { form: "--pool-size N", desc: "value pool size (default: 10000)" }, + FlagDoc { form: "--sample-every N", desc: "latency sampling rate 1/N (default: 10)" }, + FlagDoc { form: "--caches / -c LIST", desc: "comma-separated list of caches to run" }, + FlagDoc { form: "--info", desc: "full reference: config, methodology, column definitions" }, + FlagDoc { form: "--help / -h", desc: "print brief flag summary" }, +]; + +fn print_flag_table() { + for f in FLAG_TABLE { + println!(" {:<19} {}", f.form, f.desc); + } +} + fn print_help() { println!("rust-cache-benchmarks — concurrent in-memory cache benchmark\n"); println!("USAGE:"); println!(" cargo run --release [-- [FLAGS]]\n"); println!("FLAGS:"); - println!(" --size N cache_size (default: 30000)"); - println!(" --keys N num_distinct_keys (default: 480000, i.e. 
16× cache_size)"); - println!(" --tasks N num_tasks (default: available_parallelism)"); - println!( - " --iters N num_iterations max (default: 30; adaptive stopping may run fewer)" - ); - println!(" --warmup N warmup_iterations (default: 1)"); - println!(" --ratio F read_write_ratio 0.0–1.0 (default: 0.80)"); - println!(" --value-size N value size in bytes (default: 5120)"); - println!(" --zipf F Zipf exponent (default: 1.07) sweep 0.8–1.5"); - println!(" --pattern P access pattern: zipfian|uniform|sequential (default: zipfian)"); - println!(" --cold-start start with empty cache instead of pre-populated"); - println!(" --key-size N minimum key string width, zero-padded (default: 0)"); - println!(" --write-new F fraction of writes inserting a fresh key (default: 0.0)"); - println!(" --seed N RNG seed (default: 42)"); - println!(" --pool-size N value pool size (default: 10000)"); - println!(" --sample-every N latency sampling rate 1/N (default: 10)"); - println!(" --caches / -c LIST comma-separated list of caches to run"); - println!(" --info full reference: config, methodology, column definitions"); - println!(" --help / -h print this help\n"); + print_flag_table(); + println!(); println!(" All flags accept both space-separated and = forms: --zipf 1.2 --zipf=1.2\n"); println!("AVAILABLE CACHES:"); println!(" moka, minimoka, quick_cache, schnellru, stretto, lrucache, tinyufo,"); - println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue, neocache\n"); + println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue\n"); println!("EXAMPLES:"); println!(" cargo run --release # run all caches"); - println!(" cargo run --release -- -c moka,neocache # run a subset"); + println!(" cargo run --release -- -c moka,quick_cache # run a subset"); println!(" cargo run --release -- --size 10000 --zipf 0.9 # custom config"); println!(" cargo run --release -- --pattern uniform --ratio 0.5 # different workload"); println!(" cargo run --release -- --info # 
full reference\n"); @@ -472,6 +564,19 @@ fn parse_args() -> Result<(BenchConfig, std::collections::HashSet), Stri i += 1; } + // Cross-flag validation. A keyspace smaller than the cache means the + // cache will admit every key with no eviction pressure at all — the + // benchmark stops measuring eviction policy quality and becomes a + // hash-table micro-benchmark, which is not what this harness exists for. + // Defaults set num_distinct_keys = 16 × cache_size; require at least 2× + // for any user override so the comparison stays meaningful. + if num_distinct_keys < cache_size * 2 { + return Err(format!( + "--keys ({num_distinct_keys}) must be at least 2× --size ({cache_size}) so the \ + cache experiences eviction pressure. Recommended ratio is 16× (the default)." + )); + } + let config = BenchConfig { cache_size, num_tasks, @@ -818,35 +923,19 @@ fn print_info() { println!("━━━ CLI FLAGS ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); - println!(" --size N cache_size (default: 30000)"); - println!(" --keys N num_distinct_keys (default: 480000, i.e. 
16× cache_size)"); - println!(" --tasks N num_tasks (default: available_parallelism)"); - println!(" --iters N num_iterations / max iterations (default: 30)"); - println!(" --warmup N warmup_iterations (default: 1)"); - println!(" --ratio F read_write_ratio 0.0–1.0 (default: 0.80)"); - println!(" --value-size N value size in bytes (default: 5120)"); - println!(" --zipf F Zipf exponent (default: 1.07) sweep 0.8–1.5"); - println!(" --pattern P access pattern: zipfian|uniform|sequential (default: zipfian)"); - println!(" --cold-start start with empty cache instead of pre-populated"); - println!(" --key-size N minimum key string width, zero-padded (default: 0)"); - println!(" --write-new F fraction of writes inserting a fresh key (default: 0.0)"); - println!(" --seed N RNG seed (default: 42)"); - println!(" --pool-size N value pool size (default: 10000)"); - println!(" --sample-every N latency sampling rate 1/N (default: 10)"); - println!(" --caches / -c LIST comma-separated list of caches to run"); - println!(" --info print this full reference"); - println!(" --help / -h print brief flag summary\n"); + print_flag_table(); + println!(); println!(" All flags accept both space-separated and = forms:"); println!(" --zipf 1.2 --zipf=1.2\n"); println!(" Available caches:"); println!(" moka, minimoka, quick_cache, schnellru, stretto, lrucache, tinyufo,"); - println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue, neocache\n"); + println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue\n"); println!(" Usage:"); println!(" cargo run --release # run all caches"); - println!(" cargo run --release -- -c moka,neocache # run a subset"); + println!(" cargo run --release -- -c moka,quick_cache # run a subset"); println!(" cargo run --release -- --size 10000 --zipf 0.9 # custom config"); println!(" cargo run --release -- --pattern uniform --ratio 0.5 # different workload"); println!(" cargo run --release -- --info # print this reference\n"); @@ -934,12 
+1023,11 @@ async fn main() { .map(|(name, results)| (*name, AggregatedStats::from_results(results))) .collect(); - // Sort by median throughput (descending) - stats_map.sort_by(|a, b| { - b.1.throughput_median - .partial_cmp(&a.1.throughput_median) - .unwrap() - }); + // Sort by median throughput (descending). `total_cmp` instead of + // `partial_cmp().unwrap()` so a NaN (which today cannot occur, but + // guarding against it is free — `total_cmp` is a drop-in here) + // would simply sort to the end rather than panic the binary. + stats_map.sort_by(|a, b| b.1.throughput_median.total_cmp(&a.1.throughput_median)); // Print combined results table println!( @@ -955,7 +1043,9 @@ async fn main() { "p99cv%", "wp99µs", "hit%", - "±hitci" + "±hitci", + BOLD = bold(), + RESET = reset(), ); println!("{}", "─".repeat(99)); @@ -998,24 +1088,25 @@ async fn main() { println!(" (!) cv% > 10%: throughput is noisy — consider rerunning on a quieter system"); } - println!("{DIM} legend: ops/sec=throughput ±ci95=95% CI cv%=variability tailµs=p99−p50 wp99µs=write p99 ±hitci=hit-rate CI{RESET}"); + println!( + "{DIM} legend: ops/sec=throughput ±ci95=95% CI cv%=variability tailµs=p99−p50 wp99µs=write p99 ±hitci=hit-rate CI{RESET}", + DIM = dim(), + RESET = reset(), + ); - // Overall winners + // Overall winners. `total_cmp` everywhere so a NaN (which today cannot + // occur, but defensive-cmp is cheap) cannot panic the summary line.
let (best_throughput, best_throughput_stats) = &stats_map[0]; let best_latency = stats_map .iter() - .min_by(|a, b| { - a.1.latency_p99_median - .partial_cmp(&b.1.latency_p99_median) - .unwrap() - }) + .min_by(|a, b| a.1.latency_p99_median.total_cmp(&b.1.latency_p99_median)) .unwrap(); let best_consistency = stats_map .iter() .min_by(|a, b| { let a_amp = a.1.latency_p99_median - a.1.latency_p50_median; let b_amp = b.1.latency_p99_median - b.1.latency_p50_median; - a_amp.partial_cmp(&b_amp).unwrap() + a_amp.total_cmp(&b_amp) }) .unwrap(); @@ -1027,7 +1118,7 @@ async fn main() { let multi = stats_map.len() > 1; let w = |s: &str| -> String { if multi { - format!("{BOLD}{GREEN}{s}{RESET}") + format!("{}{}{}{}", bold(), green(), s, reset()) } else { s.to_string() }