From a64923fcfabe2a230b5fa4c3bda8ef988d330514 Mon Sep 17 00:00:00 2001 From: sam-obeid Date: Mon, 4 May 2026 11:45:24 -0400 Subject: [PATCH] Public-release hardening: drop neocache, reproducibility & CI fixes Single squashed commit covering everything needed to make a fresh public clone build, lint, test and release-build green from a CI runner with no Shopify git credentials. Drop the neocache benchmark from the public repo - Remove the `Shopify/neocache` git dependency from Cargo.toml and its package stanza + cache-bench dep entry from Cargo.lock. - Delete src/caches/neocache.rs. - Remove neocache from ALL_CACHES and the dispatch arm in src/main.rs, and from the AVAILABLE CACHES / Available caches blocks in --help and --info output. - Replace the [neocache] row in the README comparison table with a 'Note on neocache' explainer documenting why it is absent and how to re-add it on a private overlay branch. Rationale: cargo always fetches git sources referenced by Cargo.lock, even for `optional = true` deps with the feature disabled, so feature-gating cannot fix the auth failure on public CI runners. Verified empirically: `cargo clippy --all-targets --locked -- -D warnings` now passes from a fresh CARGO_HOME with no git creds (the exact public-CI scenario), where it previously failed with 'failed to authenticate when downloading repository'. CI workflow + supply-chain hardening - .github/workflows/ci.yml: pin all third-party actions to 40-char SHAs (with version comment for Dependabot), lock GITHUB_TOKEN to `contents: read`, build the cargo cache key from OS+toolchain+ Cargo.lock with a toolchain-scoped restore-key, fail-fast off, and add a `cargo audit --deny warnings` job with a cached cargo-audit binary. - .cargo/audit.toml: explicitly-accepted advisories with rationale and review dates; new advisories fail the build by default. - .github/CODEOWNERS: ownership for review routing. 
- Cargo.toml: `rust-version = "1.94"` MSRV, `publish = false` (the `cache-bench` name is unclaimed on crates.io and we have no intent to claim it), readme/keywords/categories metadata. Reproducibility plumbing - generate_value_pool now takes the base RNG seed and derives a deterministic StdRng from it (with a 0xC0FFEE offset to avoid colliding with per-task seeds), so re-running with the same --seed produces bit-identical value bytes. - generate_fixed_value takes the RNG by reference instead of using an unseeded thread RNG. - calculate_statistics: f64::total_cmp instead of partial_cmp so a stray NaN sorts deterministically rather than panicking. - BenchConfig::cold_start docs spell out that it is forced `true` during the calibration pass. - Document the fresh-key window wraparound for write_new_key_fraction > 0 in total_benchmark_keys. CLI polish - ANSI colour escapes are now emitted only when stdout is a TTY (cached behind OnceLock); piping benchmark output to a file or CI log no longer produces literal \x1b[...m sequences. Docs - README: refreshed benchmark-output screenshot URL (kept the new user-attachments image from #6 over the in-repo placeholder that referenced a not-yet-committed file). - CONTRIBUTING.md: clarifications. - docs/assets/README.md: contributor guidance scaffolding. 
Verification (local, all green): - cargo fmt --check - cargo clippy --all-targets --locked -- -D warnings (also from a fresh empty CARGO_HOME with no git creds) - cargo build --locked - cargo build --release --locked - cargo test --locked (20 passed) --- .cargo/audit.toml | 29 +++++ .github/CODEOWNERS | 27 +++++ .github/workflows/ci.yml | 48 +++++--- CONTRIBUTING.md | 9 +- Cargo.lock | 13 --- Cargo.toml | 23 +++- README.md | 136 +++++++++++++++------- docs/assets/README.md | 26 +++++ src/caches/minimoka.rs | 8 +- src/caches/mod.rs | 71 ++++++++++-- src/caches/neocache.rs | 157 -------------------------- src/main.rs | 235 +++++++++++++++++++++++++++------------ 12 files changed, 471 insertions(+), 311 deletions(-) create mode 100644 .cargo/audit.toml create mode 100644 .github/CODEOWNERS create mode 100644 docs/assets/README.md delete mode 100644 src/caches/neocache.rs diff --git a/.cargo/audit.toml b/.cargo/audit.toml new file mode 100644 index 0000000..f8ef703 --- /dev/null +++ b/.cargo/audit.toml @@ -0,0 +1,29 @@ +# cargo-audit configuration. +# +# CI runs `cargo audit --deny warnings` so any *new* advisory (vulnerability, +# unsoundness, or unmaintained crate) fails the build. The entries below are +# the explicitly-accepted exceptions: each one must have a tracking note and +# should be revisited on every cargo bump. +# +# When adding an entry: include the advisory ID, a one-line reason, and the +# date it was accepted. When removing: confirm the upstream fix shipped and +# `cargo audit` returns clean without the ignore. + +[advisories] +ignore = [ + # RUSTSEC-2024-0436 — `paste` 1.0.15 is unmaintained. + # Pulled in transitively via foyer 0.22 -> foyer-memory -> paste. No direct + # use; only its proc-macro is invoked at build time. No exposed API surface + # at runtime. Re-evaluate when foyer drops paste or migrates to pastey. + # Accepted: 2026-05. 
+ "RUSTSEC-2024-0436", + + # RUSTSEC-2026-0097 — `rand` is unsound when a custom global logger calls + # `rand::rng()` from inside its log handler. Our dependency tree pulls in + # three affected rand versions + # (0.8.5 via stretto/caches, 0.9.2 via foyer/twox-hash, 0.10.0 direct). + # This benchmark does not install a custom `log` handler and does not call + # `rand::rng()` from any logger path, so the unsoundness condition cannot + # be triggered here. Re-evaluate when transitive deps move to a fixed rand. + # Accepted: 2026-05. + "RUSTSEC-2026-0097", +] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..40d01a3 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,27 @@ +# CODEOWNERS for rust-cache-benchmarks +# +# GitHub auto-assigns the listed teams as reviewers on every PR (including +# Dependabot bumps). Order matters: the *last* matching pattern wins. +# +# All paths below are owned by @Shopify/rust-cache-benchmarks-maintainers. +# If that team is renamed, retired, or split, update every line in this file +# in the same commit — GitHub silently no-ops entries that point at unknown +# teams, which would leave PRs without a required reviewer. + +# Default owner for everything in the repo. +* @Shopify/rust-cache-benchmarks-maintainers + +# Benchmark methodology and statistical reporting are the highest-trust +# surface in this repo. Any change here affects the public comparison and +# should get an extra reviewer who has context on the published numbers. +/src/main.rs @Shopify/rust-cache-benchmarks-maintainers +/src/caches/mod.rs @Shopify/rust-cache-benchmarks-maintainers +/README.md @Shopify/rust-cache-benchmarks-maintainers +/CONTRIBUTING.md @Shopify/rust-cache-benchmarks-maintainers + +# Security-sensitive files: anyone changing the audit allowlist or the CI +# pipeline should get a security-aware review.
+/SECURITY.md @Shopify/rust-cache-benchmarks-maintainers +/.cargo/audit.toml @Shopify/rust-cache-benchmarks-maintainers +/.github/workflows/ @Shopify/rust-cache-benchmarks-maintainers +/.github/CODEOWNERS @Shopify/rust-cache-benchmarks-maintainers diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a58dc50..2098222 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,28 +15,44 @@ env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 +# All third-party actions are pinned to a 40-char commit SHA. The trailing +# comment is the human-readable version Dependabot uses to keep the SHA up to +# date (see .github/dependabot.yml — ecosystem `github-actions`). Never +# downgrade these to floating tags or branches such as `@v4` or `@master`: a +# compromise of the upstream action repo would otherwise execute attacker code +# inside this workflow with the GITHUB_TOKEN scope. jobs: check: - name: Check - runs-on: ubuntu-latest + name: Check (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + # Don't let a flake on one OS hide a real bug on the other. + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@master + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master @ 2026-05 with: toolchain: "1.94" components: rustfmt, clippy - name: Cache cargo registry - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: | ~/.cargo/registry ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-cargo- + # Keys are scoped by OS, toolchain, and Cargo.lock. 
The fallback + # restore-key is also scoped by toolchain so a cache from a + # different Rust version is never partially mounted on top of + # this build (which has historically caused mysterious link + # errors on macOS runners). + key: ${{ runner.os }}-rust-1.94-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-rust-1.94-cargo- - name: Check formatting run: cargo fmt --check @@ -57,18 +73,20 @@ jobs: name: Audit runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@master + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master @ 2026-05 with: toolchain: "1.94" - name: Cache cargo-audit binary - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: ~/.cargo/bin/cargo-audit - key: ${{ runner.os }}-cargo-audit-bin-v1 + # Cache key includes toolchain so audit binaries built against + # an older Rust never silently land in a newer pipeline. + key: ${{ runner.os }}-rust-1.94-cargo-audit-bin-v1 - name: Install cargo-audit run: | @@ -77,6 +95,8 @@ jobs: fi - name: Audit dependencies - # cargo audit fails on vulnerabilities by default. Warnings (unmaintained - # crates, soundness advisories) are reported but do not fail the job. - run: cargo audit + # `--deny warnings` makes any *new* unmaintained or unsoundness + # advisory fail the build. Currently-accepted advisories are listed + # explicitly in .cargo/audit.toml with rationale and review dates; + # adding a new ignore there is a deliberate, reviewable code change. + run: cargo audit --deny warnings diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e306876..ebe9170 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,8 @@ Thanks for your interest in contributing! `rust-cache-benchmarks` is a benchmark ## Local checks -CI runs against Rust `1.94`. 
All of these must pass: +CI runs against Rust `1.94` (the project's MSRV — see `rust-toolchain.toml` +and `Cargo.toml`'s `rust-version`). All of these must pass: ```sh cargo fmt --check @@ -41,6 +42,12 @@ cargo clippy --all-targets --locked -- -D warnings cargo build --locked cargo build --release --locked cargo test --locked + +# Supply-chain audit. CI runs this with `--deny warnings`, so any *new* +# unmaintained or unsoundness advisory will fail the pipeline. Currently +# accepted advisories are listed in .cargo/audit.toml with rationale. +# `cargo install --locked cargo-audit` if you don't have it locally. +cargo audit --deny warnings ``` ## Running benchmarks diff --git a/Cargo.lock b/Cargo.lock index acc0243..331f46b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -151,7 +151,6 @@ dependencies = [ "lru-mem", "mini-moka", "moka", - "neocache", "parking_lot", "quick_cache", "rand 0.10.0", @@ -1070,18 +1069,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "neocache" -version = "0.1.0" -source = "git+https://github.com/Shopify/neocache?rev=3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c#3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c" -dependencies = [ - "ahash", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "parking_lot_core", -] - [[package]] name = "num-traits" version = "0.2.19" diff --git a/Cargo.toml b/Cargo.toml index 96e537c..3b3b0b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,11 +2,22 @@ name = "cache-bench" version = "0.1.0" edition = "2021" +# MSRV. Kept in sync with `rust-toolchain.toml` and the toolchain pinned in +# `.github/workflows/ci.yml`. Bumping any of the three requires bumping the +# other two. 
+rust-version = "1.94" description = "Concurrent in-memory cache benchmarks for Rust" license = "MIT" +readme = "README.md" repository = "https://github.com/Shopify/rust-cache-benchmarks" homepage = "https://github.com/Shopify/rust-cache-benchmarks" authors = ["Shopify Inc."] +keywords = ["cache", "benchmark", "lru", "concurrent", "performance"] +categories = ["caching", "development-tools::profiling"] +# This is a benchmark harness, not a library. Block accidental `cargo publish` +# from a maintainer's machine or a future release CI — the `cache-bench` name +# is currently free on crates.io and we do not intend to claim it. +publish = false [dependencies] schnellru = "0.2.4" @@ -27,13 +38,17 @@ clru = "0.6.3" lru-mem = "0.3.0" sieve-cache = "1.1.6" caches = "0.3.0" -# Pinned to a commit SHA so `cargo update` cannot silently roll the -# benchmark forward to a future neocache release with different -# performance characteristics. Bump deliberately when re-baselining. -neocache = { git = "https://github.com/Shopify/neocache", rev = "3c87b8fde4fab6eb3e9285c05df5ab8ae0cf498c" } rand_distr = "0.6" parking_lot = "0.12" +# `neocache` is intentionally not a dependency of this public repository. +# The upstream crate lives in a private Shopify repo, so any reference to +# it (even an `optional = true` git dependency) is recorded in `Cargo.lock` +# and forces `cargo` to fetch the source on every build — which fails with +# `failed to authenticate when downloading repository` on any clone +# without Shopify git credentials, including public CI runners. See the +# `Per-cache configuration deviations` section of `README.md`. + [profile.release] opt-level = 3 lto = "fat" diff --git a/README.md b/README.md index a926731..533e2e4 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ Concurrent in-memory cache benchmarks for Rust. 
Compares throughput, hit rate, a image - ### Bachmark machine specs: | Component | Details | @@ -25,26 +24,80 @@ Concurrent in-memory cache benchmarks for Rust. Compares throughput, hit rate, a ## Caches compared -| Crate | Strategy | -|---|---| -| [moka](https://crates.io/crates/moka) | TinyLFU / W-TinyLFU | -| [mini-moka](https://crates.io/crates/mini-moka) | TinyLFU (lighter weight) | -| [quick_cache](https://crates.io/crates/quick_cache) | LRU / CLOCK-Pro | -| [schnellru](https://crates.io/crates/schnellru) | LRU | -| [stretto](https://crates.io/crates/stretto) | Ristretto (TinyLFU) | -| [lru](https://crates.io/crates/lru) | LRU | -| [TinyUFO](https://crates.io/crates/TinyUFO) | TinyUFO | -| [foyer](https://crates.io/crates/foyer) | S3-FIFO | -| [cached](https://crates.io/crates/cached) | LRU (proc-macro) | -| [clru](https://crates.io/crates/clru) | Count-Min LRU | -| [lru-mem](https://crates.io/crates/lru-mem) | Memory-bounded LRU | -| [sieve-cache](https://crates.io/crates/sieve-cache) | SIEVE | -| [caches](https://crates.io/crates/caches) | Two-Queue | -| [neocache](https://github.com/Shopify/neocache) | S3-FIFO + DashMap | +The `Concurrency` column is a critical fairness disclosure: throughput numbers +are dominated by lock topology as much as by eviction policy. Caches marked +*sharded* serialise only within their shard; caches marked *single mutex* are +wrapped by this benchmark in a `parking_lot::Mutex` because the underlying +crate is not internally `Sync`-safe for concurrent writers. Compare +like-for-like before drawing conclusions. 
+ +| Crate | Strategy | Concurrency | +|---|---|---| +| [moka](https://crates.io/crates/moka) | TinyLFU / W-TinyLFU | sharded (built-in) | +| [mini-moka](https://crates.io/crates/mini-moka) | TinyLFU (lighter weight) | sharded (built-in) | +| [quick_cache](https://crates.io/crates/quick_cache) | LRU / CLOCK-Pro | sharded (built-in) | +| [schnellru](https://crates.io/crates/schnellru) | LRU | single `Mutex` (this harness) | +| [stretto](https://crates.io/crates/stretto) | Ristretto (TinyLFU) | sharded (built-in) - see footnote † | +| [lru](https://crates.io/crates/lru) | LRU | single `Mutex` (this harness) | +| [TinyUFO](https://crates.io/crates/TinyUFO) | TinyUFO | sharded (built-in) | +| [foyer](https://crates.io/crates/foyer) | S3-FIFO | sharded (built-in) | +| [cached](https://crates.io/crates/cached) | LRU (proc-macro) | single `Mutex` (this harness) | +| [clru](https://crates.io/crates/clru) | Count-Min LRU | single `Mutex` (this harness) | +| [lru-mem](https://crates.io/crates/lru-mem) | Memory-bounded LRU | single `Mutex` (this harness) - see footnote ‡ | +| [sieve-cache](https://crates.io/crates/sieve-cache) | SIEVE | sharded (`ShardedSieveCache`) | +| [caches](https://crates.io/crates/caches) | Two-Queue | single `Mutex` (this harness) — see footnote § | + +> **Note on `neocache`.** Earlier internal revisions of this harness also +> benchmarked Shopify's in-house `neocache` crate (S3-FIFO + DashMap, built-in +> sharding). It is intentionally absent from the public release: `neocache` +> lives in a private repository, and even an `optional = true` git +> dependency forces `cargo` to record the source in `Cargo.lock` and fetch +> it on every build, which fails for any clone without Shopify git +> credentials (including public CI runners). Reintroducing it requires a +> private overlay branch that adds the dependency, the +> `src/caches/neocache.rs` module, and matching entries in `ALL_CACHES` and +> `dispatch` in `src/main.rs`. 
+ +### Per-cache configuration deviations + +Every cache is constructed with default parameters *unless* listed here. Anything +on this list is a deliberate choice we have made on behalf of the cache; the +rationale is in the matching `src/caches/<name>.rs` file. If you believe one of +these tunings biases the comparison, please file a methodology issue with a +before/after diff. + +- **† stretto** - constructed with `num_counters = 10× cache_size`, + `set_ignore_internal_cost(true)`, and `set_buffer_size(num_tasks × 4 KiB)`. + Stretto additionally receives a unique 3-phase warmup (insert → read pass to + populate TinyLFU frequency counts → re-insert with TinyLFU-informed admission) + because its frequency sketch is updated only from reads, never from inserts; + a single-phase warmup leaves admission decisions effectively random and + collapses the hit rate to ~45%. +- **‡ lru-mem** - capacity is byte-budgeted (`cache_size × value_size`). The + budget covers value bytes only; `String` headers and key-string storage are + not accounted for, so the effective entry count slightly exceeds + `cache_size`. Documented for transparency; correcting it would require + bespoke `HeapSize` accounting that other caches do not model. +- **§ caches (Two-Queue)** - the [`caches`](https://crates.io/crates/caches) + crate has not seen a release since 2022 and pulls in `rand 0.8.5`. Included + for historical comparison; a future PR may retire it once a maintained + alternative exists. +- **mini-moka** - unlike moka, mini-moka 0.10 does not expose a public flush + analogous to `run_pending_tasks()` (the equivalent `sync()` is private), so + warmup completion is not synchronous on this cache. The first measurement + iteration absorbs the residual eviction work; later iterations measure + steady state. +- **lrucache, schnellru, clru, cached, two-queue, lru-mem** - wrapped in a + single `parking_lot::Mutex` because none of them is `Sync` for concurrent + writers.
This makes them strictly worse than sharded caches under the + default 8-thread workload; the comparison is still meaningful as a + *deployment* signal ("do not use these from multiple writer threads"), not + as a pure algorithmic ranking. ## Requirements -- Rust toolchain (pinned in `rust-toolchain.toml` — installed automatically via `rustup`) +- Rust toolchain (MSRV 1.94, pinned in `rust-toolchain.toml`, `Cargo.toml`'s + `rust-version`, and the CI workflow). Installed automatically via `rustup`. - For best results, run on a quiet machine (no other heavy processes) ## Build and run @@ -54,7 +107,7 @@ Concurrent in-memory cache benchmarks for Rust. Compares throughput, hit rate, a cargo run --release # Run a subset -cargo run --release -- --caches moka,quick_cache,neocache +cargo run --release -- --caches moka,quick_cache,sieve # Override config parameters at runtime (no recompile needed) cargo run --release -- --size 10000 --zipf 0.9 @@ -76,19 +129,19 @@ cargo run --release -- --info | Column | Meaning | |---|---| -| `ops/sec` | Median throughput across all measurement iterations — total operations (reads + writes) per wall-clock second | +| `ops/sec` | Median throughput across all measurement iterations - total operations (reads + writes) per wall-clock second | | `±ci95` | 95% CI half-width as a percentage of the mean throughput (uses the actual `n` for this cache). Two caches whose CI ranges don't overlap are distinguishable at the 95% level | | `cv%` | Coefficient of variation (`stddev / mean × 100`). < 3% = stable; > 10% = noisy, flagged with `!` | | `n` | Actual number of measurement iterations run. 
Adaptive early stopping may converge before the 30-iteration maximum | -| `p50µs` | Median of per-iteration p50 latencies in µs — the typical latency of a representative single run | -| `p99µs` | Median of per-iteration p99 latencies in µs — avoids the pooling bias of aggregating all iterations before computing percentiles | -| `tailµs` | Tail amplification = `p99µs − p50µs` (both medians) in µs. Lower = more consistent per-op cost | -| `p99cv%` | CV% of per-iteration p99 latencies — measures tail latency *stability*; high = occasional spikes invisible in the median | -| `wp99µs` | Median of per-iteration **write-only** p99 latencies in µs. Isolates write cost from the read-dominated `p99µs`; especially diagnostic for mutex-backed caches where writes hold an exclusive lock. Shows `--` if no write samples exist | -| `hit%` | Cache hit rate — fraction of reads that found the key; purely a function of eviction policy and access pattern | +| `p50μs` | Median of per-iteration p50 latencies in μs - the typical latency of a representative single run | +| `p99μs` | Median of per-iteration p99 latencies in μs - avoids the pooling bias of aggregating all iterations before computing percentiles | +| `tailμs` | Tail amplification = `p99μs - p50μs` (both medians) in μs. Lower = more consistent per-op cost | +| `p99cv%` | CV% of per-iteration p99 latencies - measures tail latency *stability*; high = occasional spikes invisible in the median | +| `wp99μs` | Median of per-iteration **write-only** p99 latencies in μs. Isolates write cost from the read-dominated `p99μs`; especially diagnostic for mutex-backed caches where writes hold an exclusive lock. Shows `--` if no write samples exist | +| `hit%` | Cache hit rate - fraction of reads that found the key; purely a function of eviction policy and access pattern | | `±hitci` | 95% CI half-width for hit rate in absolute percentage points (e.g. `±0.020` = ±0.020 pp). 
Three decimal places avoids the misleading `±0.00` display when CI is very tight but nonzero | -> **Note on latency floor:** Each sampled latency is bracketed by two TSC reads (~10–20 ns each on Linux via vDSO). The combined ~20–40 ns overhead inflates `p50µs`/`p99µs` for the fastest caches. Use latency numbers for relative comparison, not absolute wall-clock cost. +> **Note on latency floor:** Each sampled latency is bracketed by two TSC reads (~10-20 ns each on Linux via vDSO). The combined ~20-40 ns overhead inflates `p50μs`/`p99μs` for the fastest caches. Use latency numbers for relative comparison, not absolute wall-clock cost. ## Configuration @@ -103,7 +156,7 @@ All parameters have sensible defaults and can be overridden via CLI flags withou | `value_size` | `--value-size` | 5,120 B | 5 KB cached values | | `num_iterations` | `--iters` | 30 (max) | Hard cap; adaptive early stopping may converge sooner | | `warmup_iterations` | `--warmup` | 1 | One ~3 s pass primes branch predictors and engages Turbo Boost | -| `zipf_exponent` | `--zipf` | 1.07 | ≈ 80/20 hotspot skew; sweep 0.8–1.5 | +| `zipf_exponent` | `--zipf` | 1.07 | ≈ 80/20 hotspot skew; sweep 0.8-1.5 | | `access_pattern` | `--pattern` | zipfian | `zipfian`, `uniform`, or `sequential` | | `cold_start` | `--cold-start` | false | Start with empty cache | | `latency_sample_every` | `--sample-every` | 10 | Bernoulli sampling rate (~10% of ops) | @@ -113,17 +166,22 @@ All parameters have sensible defaults and can be overridden via CLI flags withou ## Benchmark methodology -- **Task-parallel, OS-thread model** — each task runs in a `tokio::task::spawn_blocking` thread for true parallelism -- **Barrier-synchronized start** — all tasks start simultaneously; throughput = `total_ops / wall_time_of_slowest_task` -- **Per-cache ops calibration** — a 200 K-op/task calibration pass before warmup measures each cache's throughput and sets `num_keys_per_task = clamp(3 s × throughput / num_tasks, 500 K, 2 M)`, targeting 
~3 s wall time per iteration; prevents slow global-mutex caches from dominating total runtime -- **Per-iteration randomised ordering** — the remaining active caches are shuffled each iteration to eliminate thermal/position bias -- **Adaptive early stopping** — after each iteration, any cache with ≥ 15 samples and 95% CI < 0.75% of mean is retired; the `n` column shows actual iterations run -- **Median throughput** — robust to single-iteration outliers from thermal throttling or OS scheduling -- **95% confidence intervals** — reported on both throughput (`±ci95`) and hit rate (`±hitci`) -- **Per-iteration latency percentiles** — p50/p99 computed per iteration then summarised with the median; avoids the pooling bias of aggregating all samples before computing percentiles -- **Bernoulli latency sampling** — each op is sampled with probability 1/N from the task's existing RNG; avoids phase-locking with periodic cache slow paths that deterministic modulo sampling can miss -- **Noisy-run flagging** — throughput CV% > 10 is marked with `!` in the output table; a footnote is printed below the table when triggered +- **Task-parallel, OS-thread model** - each task runs in a `tokio::task::spawn_blocking` thread for true parallelism +- **Barrier-synchronized start** - all tasks start simultaneously; throughput = `total_ops / wall_time_of_slowest_task` +- **Per-cache ops calibration** - a 200 K-op/task calibration pass before warmup measures each cache's throughput and sets `num_keys_per_task = clamp(3 s × throughput / num_tasks, 500 K, 2 M)`, targeting ~3 s wall time per iteration; prevents slow global-mutex caches from dominating total runtime +- **Per-iteration randomised ordering** - the remaining active caches are shuffled each iteration to eliminate thermal/position bias +- **Adaptive early stopping** - after each iteration, any cache with ≥ 15 samples and 95% CI < 0.75% of mean is retired; the `n` column shows actual iterations run +- **Median throughput** - robust to 
single-iteration outliers from thermal throttling or OS scheduling +- **95% confidence intervals** - reported on both throughput (`±ci95`) and hit rate (`±hitci`) +- **Per-iteration latency percentiles** - p50/p99 computed per iteration then summarised with the median; avoids the pooling bias of aggregating all samples before computing percentiles +- **Bernoulli latency sampling** - each op is sampled with probability 1/N from the task's existing RNG; avoids phase-locking with periodic cache slow paths that deterministic modulo sampling can miss +- **Noisy-run flagging** - throughput CV% > 10 is marked with `!` in the output table; a footnote is printed below the table when triggered + +## Reporting security issues + +Please do not open public issues for security reports. See [SECURITY.md](SECURITY.md) +for private disclosure channels. ## License -MIT +Licensed under the [MIT License](LICENSE). diff --git a/docs/assets/README.md b/docs/assets/README.md new file mode 100644 index 0000000..1c993bd --- /dev/null +++ b/docs/assets/README.md @@ -0,0 +1,26 @@ +# Assets + +This directory holds README assets that need to be versioned with the repo +(screenshots, charts, banners) so they cannot break independently of the code. + +## Currently expected files + +| Path | Purpose | +|---|---| +| `benchmark-output.png` | Hero screenshot of `cargo run --release` output, referenced from the top of `README.md`. | + +## How to refresh the screenshot + +1. On a quiet machine matching the spec table in `README.md`, run: + ```sh + cargo run --release > /tmp/bench.txt + ``` +2. Take a screenshot of the rendered table (the binary uses ANSI escapes for + colour, so a terminal screenshot is more readable than the raw text). +3. Save as `docs/assets/benchmark-output.png`. Keep it under ~500 KB; resize + to a reasonable display width (1300–1600 px is fine). +4. Commit. The README link is a relative path, so no further changes needed.
+ +Do **not** re-link the README to `github.com/user-attachments/...` URLs: +those are tied to a single uploader and disappear if the user is removed +or the upload is purged. diff --git a/src/caches/minimoka.rs b/src/caches/minimoka.rs index 41774f8..f48a955 100644 --- a/src/caches/minimoka.rs +++ b/src/caches/minimoka.rs @@ -20,8 +20,12 @@ pub async fn run_bench( let value = value_pool[key_idx % value_pool.len()].clone(); cache.insert(key_pool[key_idx].clone(), value); } - // Flush deferred eviction decisions before measurement, matching moka's - // run_pending_tasks() call and ensuring a consistent steady-state start. + // mini-moka 0.10 does NOT expose a public flush analogous to + // moka::sync::Cache::run_pending_tasks(); the equivalent `sync()` is + // private. Pending eviction work may therefore still be in-flight when + // measurement begins. The first warmup iteration absorbs most of it, + // but mini-moka's read of `entry_count` shortly after warmup may lag + // behind moka's. Documented here so the asymmetry is not silent. } let cache = Arc::new(cache); diff --git a/src/caches/mod.rs b/src/caches/mod.rs index 10535e0..b9848be 100644 --- a/src/caches/mod.rs +++ b/src/caches/mod.rs @@ -11,7 +11,12 @@ pub mod lrucache; pub mod lrumem; pub mod minimoka; pub mod moka; -pub mod neocache; +// `neocache` is intentionally not benchmarked from this public repo: +// the upstream crate lives in a private Shopify repo and so cannot be +// cloned by external users or CI runners. See the comment in +// `Cargo.toml` and the README. To benchmark `neocache` internally, +// keep the dependency, this `pub mod`, and the corresponding entry in +// `ALL_CACHES`/`dispatch` in a private overlay branch. 
pub mod quick_cache; pub mod schnellru; pub mod sieve; @@ -54,7 +59,14 @@ pub struct BenchConfig { pub access_pattern: AccessPattern, /// Zipf skew exponent: 1.07 ≈ 80/20 hotspot; higher = more skewed pub zipf_exponent: f64, - /// Skip pre-population warmup to measure cold ramp-up; false = steady-state + /// Skip pre-population warmup to measure cold ramp-up; false = steady-state. + /// + /// Also set internally to `true` for the calibration pass so that + /// calibration measures pure hot-loop throughput, not + /// `(pre-pop cost + hot-loop) / wall_time` — caches with expensive + /// pre-pop (e.g. stretto's 3-phase warmup, lru-mem's per-byte tracking) + /// would otherwise be under-calibrated and assigned a smaller + /// `num_keys_per_task` than is actually warranted. pub cold_start: bool, /// Minimum key string length (zero-padded); 0 = natural numeric length pub key_size: usize, @@ -79,9 +91,13 @@ pub struct BenchResults { pub write_latencies_ns: Vec, // Sampled write-only latencies in nanoseconds } -pub fn generate_fixed_value(size: usize) -> String { - rand::rng() - .sample_iter(&rand::distr::Alphanumeric) +/// Generate a single random alphanumeric string of `size` bytes from the +/// caller-supplied RNG. Taking the RNG by reference lets the value pool be +/// generated deterministically from the benchmark's base seed instead of from +/// an unseeded thread RNG — a precondition of the project's reproducibility +/// guarantee documented in `--info`. +pub fn generate_fixed_value(rng: &mut impl rand::Rng, size: usize) -> String { + rng.sample_iter(&rand::distr::Alphanumeric) .take(size) .map(char::from) .collect() @@ -89,9 +105,24 @@ pub fn generate_fixed_value(size: usize) -> String { /// Pre-allocates a pool of values to eliminate allocation overhead during benchmarking. /// Uses Arc to avoid expensive clones — only the Arc is cloned (8 bytes), not the data. 
-pub fn generate_value_pool(pool_size: usize, value_size: usize) -> Arc>> { +/// +/// `seed` is taken from `BenchConfig::rng_seed`, with a fixed offset so it +/// cannot collide with the per-task seeds derived as `(rng_seed + task_id + 1)`. +/// Re-running with the same seed therefore produces bit-identical value bytes, +/// which is what `--info` claims and what before/after diff comparisons rely on. +pub fn generate_value_pool( + pool_size: usize, + value_size: usize, + seed: u64, +) -> Arc>> { + // Offset chosen to leave the [seed, seed + num_tasks] range free for the + // per-task RNGs in every cache implementation. 0xC0FFEE is arbitrary + // — the only requirement is that it is large enough to not collide with + // realistic task counts. + const VALUE_POOL_SEED_OFFSET: u64 = 0x00C0_FFEE; + let mut rng = rand::rngs::StdRng::seed_from_u64(seed.wrapping_add(VALUE_POOL_SEED_OFFSET)); let pool: Vec> = (0..pool_size) - .map(|_| Arc::new(generate_fixed_value(value_size))) + .map(|_| Arc::new(generate_fixed_value(&mut rng, value_size))) .collect(); Arc::new(pool) } @@ -120,7 +151,27 @@ pub fn generate_key_pool(num_keys: usize, key_size: usize) -> Arc> { } /// Returns the number of primary benchmark keys (the Zipf keyspace). -/// The full key pool is 2× this value to accommodate write_new_key_fraction. +/// The full key pool is 2× this value to accommodate `write_new_key_fraction`. +/// +/// ## Fresh-key window (write_new_key_fraction > 0) +/// +/// Each cache implementation computes a fresh-key index as: +/// +/// ```text +/// idx = total_keys + ((seq_start + fresh_write_count) % total_keys) +/// ``` +/// +/// so `idx` lives in `[total_keys, 2 × total_keys)`. Because the modulus is +/// `total_keys` (not `total_keys * num_tasks`), two tasks with identical +/// `seq_start mod total_keys` will rotate through the **same** fresh-key +/// indices. With the default 8 tasks × 2 M ops over a 480 K Zipf keyspace, +/// a full wrap needs 480 K fresh writes per task — more than a single run issues.
+/// +/// This is intentional: the goal of `write_new_key_fraction` is to model +/// insert-heavy workloads (event ingestion, telemetry firehoses) by injecting +/// keys outside the Zipf hot-set, not to allocate a fully unique fresh-key +/// stream per task. Documenting it here so the bound on the fresh-key pool +/// is auditable and the wraparound is not surprising. pub fn total_benchmark_keys(cfg: &BenchConfig) -> usize { cfg.num_distinct_keys } @@ -223,7 +274,9 @@ pub fn calculate_statistics(values: &[f64]) -> (f64, f64, f64, f64) { let mean = values.iter().sum::() / values.len() as f64; let mut sorted = values.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + // total_cmp instead of partial_cmp so a stray NaN sorts deterministically + // to one end rather than panicking the whole benchmark summary. + sorted.sort_by(|a, b| a.total_cmp(b)); let n = sorted.len(); let median = if n.is_multiple_of(2) { (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0 diff --git a/src/caches/neocache.rs b/src/caches/neocache.rs deleted file mode 100644 index f4c0025..0000000 --- a/src/caches/neocache.rs +++ /dev/null @@ -1,157 +0,0 @@ -use crate::caches::{total_benchmark_keys, BenchConfig, BenchResults, KeyGenerator}; -use neocache::NeoCache; -use rand::{RngExt, SeedableRng}; -use std::sync::Arc; -use std::time::Instant; - -/// NeoCache: concurrent HashMap with built-in S3-FIFO cache eviction. -/// Unlike the DashMap benchmark, no manual entry_count tracking or random -/// eviction is required — S3-FIFO eviction runs automatically per-shard on -/// every insert once the cache reaches capacity. 
-pub async fn run_bench( - cfg: Arc, - value_pool: Arc>>, - key_pool: Arc>, -) -> BenchResults { - let cache: Arc>> = Arc::new(NeoCache::new(cfg.cache_size)); - let total_keys = total_benchmark_keys(&cfg); - - if !cfg.cold_start { - let warmup_key_gen = KeyGenerator::new(cfg.access_pattern, total_keys, cfg.zipf_exponent); - let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.rng_seed); - for i in 0..total_keys { - let key_idx = warmup_key_gen.next_key(i, &mut rng); - let value = value_pool[key_idx % value_pool.len()].clone(); - cache.insert(key_pool[key_idx].clone(), value); - } - // No eviction trim needed: S3-FIFO enforces cache_capacity automatically. - } - - let key_gen = Arc::new(KeyGenerator::new( - cfg.access_pattern, - total_keys, - cfg.zipf_exponent, - )); - let barrier = Arc::new(std::sync::Barrier::new(cfg.num_tasks)); - - let tasks: Vec<_> = (0..cfg.num_tasks) - .map(|i| { - let my_cache = cache.clone(); - let value_pool = value_pool.clone(); - let key_pool = key_pool.clone(); - let key_gen = key_gen.clone(); - let seq_start = i * cfg.num_keys_per_task; - let seq_end = (i + 1) * cfg.num_keys_per_task; - let cfg = cfg.clone(); - let barrier = barrier.clone(); - - tokio::task::spawn_blocking(move || { - let mut local_latencies = - Vec::with_capacity(cfg.num_keys_per_task / cfg.latency_sample_every + 64); - let mut local_write_latencies = - Vec::with_capacity(cfg.num_keys_per_task / cfg.latency_sample_every / 5 + 64); - let mut local_reads = 0usize; - let mut local_hits = 0usize; - let mut fresh_write_count = 0usize; - let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.rng_seed + i as u64 + 1); - - barrier.wait(); - let task_start = Instant::now(); - - for seq_key in seq_start..seq_end { - let key_idx = key_gen.next_key(seq_key, &mut rng); - let k = &key_pool[key_idx]; - - let is_read = rng.random_bool(cfg.read_write_ratio); - let should_measure = rng.random_bool(1.0 / cfg.latency_sample_every as f64); - - if is_read { - local_reads += 1; - let op_start = 
if should_measure { - Some(Instant::now()) - } else { - None - }; - let hit = my_cache.get(k); - if let Some(t) = op_start { - local_latencies.push(t.elapsed().as_nanos() as u64); - } - if let Some(value) = hit { - local_hits += 1; - let checksum: u32 = value - .as_bytes() - .iter() - .step_by(256) - .map(|&b| b as u32) - .sum(); - std::hint::black_box(checksum); - } - } else { - let write_key_idx = if cfg.write_new_key_fraction > 0.0 - && rng.random_bool(cfg.write_new_key_fraction) - { - let idx = total_keys + ((seq_start + fresh_write_count) % total_keys); - fresh_write_count += 1; - idx - } else { - key_idx - }; - let k_write = &key_pool[write_key_idx]; - let value = value_pool[write_key_idx % value_pool.len()].clone(); - let op_start = if should_measure { - Some(Instant::now()) - } else { - None - }; - my_cache.insert(k_write.clone(), value); - if let Some(t) = op_start { - let ns = t.elapsed().as_nanos() as u64; - local_latencies.push(ns); - local_write_latencies.push(ns); - } - } - } - - ( - local_latencies, - local_write_latencies, - local_reads, - local_hits, - task_start.elapsed(), - ) - }) - }) - .collect(); - - let mut all_latencies = Vec::new(); - let mut all_write_latencies = Vec::new(); - let mut total_reads = 0usize; - let mut total_hits = 0usize; - let mut max_duration = std::time::Duration::ZERO; - for handle in tasks { - let (lat, wlat, reads, hits, duration) = handle.await.unwrap(); - all_latencies.extend(lat); - all_write_latencies.extend(wlat); - total_reads += reads; - total_hits += hits; - max_duration = max_duration.max(duration); - } - - let total_ops = cfg.num_tasks * cfg.num_keys_per_task; - let throughput = total_ops as f64 / max_duration.as_secs_f64(); - let hit_rate = if total_reads > 0 { - total_hits as f64 / total_reads as f64 - } else { - 0.0 - }; - - BenchResults { - throughput, - hit_rate, - total_ops, - total_reads, - total_hits, - latencies_ns: all_latencies, - write_latencies_ns: all_write_latencies, - } -} diff --git 
a/src/main.rs b/src/main.rs index fc10219..2458a08 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,16 +1,61 @@ use rand::{seq::SliceRandom, SeedableRng}; use std::collections::HashMap; +use std::io::IsTerminal; use std::sync::Arc; +use std::sync::OnceLock; use crate::caches::*; -// Raw ANSI sequences — no crate dependency. -const DIM: &str = "\x1b[2m"; -const RESET: &str = "\x1b[0m"; -const BOLD: &str = "\x1b[1m"; -const GREEN: &str = "\x1b[32m"; -const YELLOW: &str = "\x1b[33m"; -const CYAN: &str = "\x1b[36m"; +// ANSI colour escapes are written through these helpers so they collapse to +// empty strings when stdout is not a TTY (piped to a file, captured by CI). +// Writing the escape bytes to a non-terminal sink produces literal `\x1b[2m` +// in the output, which would be unhelpful for anyone diffing benchmark logs. +fn ansi_enabled() -> bool { + static CACHED: OnceLock = OnceLock::new(); + *CACHED.get_or_init(|| std::io::stdout().is_terminal()) +} +fn dim() -> &'static str { + if ansi_enabled() { + "\x1b[2m" + } else { + "" + } +} +fn reset() -> &'static str { + if ansi_enabled() { + "\x1b[0m" + } else { + "" + } +} +fn bold() -> &'static str { + if ansi_enabled() { + "\x1b[1m" + } else { + "" + } +} +fn green() -> &'static str { + if ansi_enabled() { + "\x1b[32m" + } else { + "" + } +} +fn yellow() -> &'static str { + if ansi_enabled() { + "\x1b[33m" + } else { + "" + } +} +fn cyan() -> &'static str { + if ansi_enabled() { + "\x1b[36m" + } else { + "" + } +} mod caches; @@ -31,7 +76,10 @@ const ALL_CACHES: &[&str] = &[ "lrumem", "sieve", "two_queue", - "neocache", + // `neocache` is deliberately not in this public list — see the + // comment in `caches/mod.rs` and `Cargo.toml` for why. Internal + // users carrying the overlay should add the entry back here in + // the same alphabetical-by-introduction order. ]; /// Dispatch a single benchmark run by cache name. 
@@ -55,7 +103,6 @@ async fn dispatch( "lrumem" => lrumem::run_bench(cfg, value_pool, key_pool).await, "sieve" => sieve::run_bench(cfg, value_pool, key_pool).await, "two_queue" => two_queue::run_bench(cfg, value_pool, key_pool).await, - "neocache" => neocache::run_bench(cfg, value_pool, key_pool).await, // Cache names are validated against `ALL_CACHES` in `parse_args`, so any // unknown name here indicates a programmer error (a name was added to // `ALL_CACHES` without a matching arm here). @@ -73,8 +120,11 @@ async fn run_benchmark_suite( let spinners = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']; let mut spin_idx: usize = 0; - // Pre-allocate value pool once for all benchmarks - let value_pool = generate_value_pool(cfg.value_pool_size, cfg.value_size); + // Pre-allocate value pool once for all benchmarks. Seeded from the same + // base seed as the rest of the benchmark, so re-running with the same + // `--seed` produces bit-identical value bytes (matching the + // reproducibility guarantee printed by `--info`). + let value_pool = generate_value_pool(cfg.value_pool_size, cfg.value_size, cfg.rng_seed); let key_pool = generate_key_pool(total_keys * 2, cfg.key_size); let active: Vec<&'static str> = ALL_CACHES @@ -97,8 +147,15 @@ async fn run_benchmark_suite( const MIN_OPS: usize = 500_000; const MAX_OPS: usize = 2_000_000; + // Calibration runs with `cold_start = true` so we measure pure hot-loop + // throughput, not (pre-pop cost + hot-loop) / wall_time. Caches with + // expensive pre-population (stretto's 3-phase warmup, lru-mem's per-byte + // tracking) would otherwise be under-calibrated relative to caches with + // cheap pre-pop, giving them a smaller `num_keys_per_task` budget than + // their steady-state throughput justifies. 
let cal_cfg = Arc::new(BenchConfig { num_keys_per_task: CAL_OPS, + cold_start: true, ..(*cfg).clone() }); let mut per_cache_cfg: HashMap<&'static str, Arc> = HashMap::new(); @@ -114,7 +171,9 @@ async fn run_benchmark_suite( sp, cal_idx + 1, total_active, - name + name, + DIM = dim(), + RESET = reset(), ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); @@ -145,7 +204,9 @@ async fn run_benchmark_suite( sp, i + 1, cfg.warmup_iterations, - name + name, + YELLOW = yellow(), + RESET = reset(), ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); @@ -183,7 +244,15 @@ async fn run_benchmark_suite( spin_idx += 1; print!( "{:<65}\r", - format!(" {} {CYAN}{BOLD}run{RESET} {} {}...", sp, n, name) + format!( + " {} {CYAN}{BOLD}run{RESET} {} {}...", + sp, + n, + name, + CYAN = cyan(), + BOLD = bold(), + RESET = reset(), + ) ); let _ = std::io::Write::flush(&mut std::io::stdout()); let result = dispatch( @@ -216,38 +285,61 @@ fn check_help_flag() -> bool { std::env::args().any(|a| a == "--help" || a == "-h") } +/// Single source of truth for the user-facing flag table. +/// +/// Both `print_help` (`--help`) and `print_info` (`--info`) render from this +/// list so they cannot drift apart. When adding or renaming a flag, update +/// **only** this table plus the matching arm in `parse_args`. +struct FlagDoc { + /// Human-readable invocation form, e.g. `"--size N"` or `"--caches / -c LIST"`. + form: &'static str, + /// One-line description suitable for both the brief and full references. + /// Includes the default value where applicable. + desc: &'static str, +} + +#[rustfmt::skip] +const FLAG_TABLE: &[FlagDoc] = &[ + FlagDoc { form: "--size N", desc: "cache_size (default: 30000)" }, + FlagDoc { form: "--keys N", desc: "num_distinct_keys (default: 480000, i.e. 
16× cache_size)" }, + FlagDoc { form: "--tasks N", desc: "num_tasks (default: available_parallelism)" }, + FlagDoc { form: "--iters N", desc: "num_iterations max (default: 30; adaptive stopping may run fewer)" }, + FlagDoc { form: "--warmup N", desc: "warmup_iterations (default: 1)" }, + FlagDoc { form: "--ratio F", desc: "read_write_ratio 0.0–1.0 (default: 0.80)" }, + FlagDoc { form: "--value-size N", desc: "value size in bytes (default: 5120)" }, + FlagDoc { form: "--zipf F", desc: "Zipf exponent (default: 1.07) sweep 0.8–1.5" }, + FlagDoc { form: "--pattern P", desc: "access pattern: zipfian|uniform|sequential (default: zipfian)" }, + FlagDoc { form: "--cold-start", desc: "start with empty cache instead of pre-populated" }, + FlagDoc { form: "--key-size N", desc: "minimum key string width, zero-padded (default: 0)" }, + FlagDoc { form: "--write-new F", desc: "fraction of writes inserting a fresh key (default: 0.0)" }, + FlagDoc { form: "--seed N", desc: "RNG seed (default: 42)" }, + FlagDoc { form: "--pool-size N", desc: "value pool size (default: 10000)" }, + FlagDoc { form: "--sample-every N", desc: "latency sampling rate 1/N (default: 10)" }, + FlagDoc { form: "--caches / -c LIST", desc: "comma-separated list of caches to run" }, + FlagDoc { form: "--info", desc: "full reference: config, methodology, column definitions" }, + FlagDoc { form: "--help / -h", desc: "print brief flag summary" }, +]; + +fn print_flag_table() { + for f in FLAG_TABLE { + println!(" {:<19} {}", f.form, f.desc); + } +} + fn print_help() { println!("rust-cache-benchmarks — concurrent in-memory cache benchmark\n"); println!("USAGE:"); println!(" cargo run --release [-- [FLAGS]]\n"); println!("FLAGS:"); - println!(" --size N cache_size (default: 30000)"); - println!(" --keys N num_distinct_keys (default: 480000, i.e. 
16× cache_size)"); - println!(" --tasks N num_tasks (default: available_parallelism)"); - println!( - " --iters N num_iterations max (default: 30; adaptive stopping may run fewer)" - ); - println!(" --warmup N warmup_iterations (default: 1)"); - println!(" --ratio F read_write_ratio 0.0–1.0 (default: 0.80)"); - println!(" --value-size N value size in bytes (default: 5120)"); - println!(" --zipf F Zipf exponent (default: 1.07) sweep 0.8–1.5"); - println!(" --pattern P access pattern: zipfian|uniform|sequential (default: zipfian)"); - println!(" --cold-start start with empty cache instead of pre-populated"); - println!(" --key-size N minimum key string width, zero-padded (default: 0)"); - println!(" --write-new F fraction of writes inserting a fresh key (default: 0.0)"); - println!(" --seed N RNG seed (default: 42)"); - println!(" --pool-size N value pool size (default: 10000)"); - println!(" --sample-every N latency sampling rate 1/N (default: 10)"); - println!(" --caches / -c LIST comma-separated list of caches to run"); - println!(" --info full reference: config, methodology, column definitions"); - println!(" --help / -h print this help\n"); + print_flag_table(); + println!(); println!(" All flags accept both space-separated and = forms: --zipf 1.2 --zipf=1.2\n"); println!("AVAILABLE CACHES:"); println!(" moka, minimoka, quick_cache, schnellru, stretto, lrucache, tinyufo,"); - println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue, neocache\n"); + println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue\n"); println!("EXAMPLES:"); println!(" cargo run --release # run all caches"); - println!(" cargo run --release -- -c moka,neocache # run a subset"); + println!(" cargo run --release -- -c moka,quick_cache # run a subset"); println!(" cargo run --release -- --size 10000 --zipf 0.9 # custom config"); println!(" cargo run --release -- --pattern uniform --ratio 0.5 # different workload"); println!(" cargo run --release -- --info # 
full reference\n"); @@ -472,6 +564,19 @@ fn parse_args() -> Result<(BenchConfig, std::collections::HashSet), Stri i += 1; } + // Cross-flag validation. A keyspace smaller than the cache means the + // cache will admit every key with no eviction pressure at all — the + // benchmark stops measuring eviction policy quality and becomes a + // hash-table micro-benchmark, which is not what this harness exists for. + // Defaults set num_distinct_keys = 16 × cache_size; require at least 2× + // for any user override so the comparison stays meaningful. + if num_distinct_keys < cache_size * 2 { + return Err(format!( + "--keys ({num_distinct_keys}) must be at least 2× --size ({cache_size}) so the \ + cache experiences eviction pressure. Recommended ratio is 16× (the default)." + )); + } + let config = BenchConfig { cache_size, num_tasks, @@ -818,35 +923,19 @@ fn print_info() { println!("━━━ CLI FLAGS ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); - println!(" --size N cache_size (default: 30000)"); - println!(" --keys N num_distinct_keys (default: 480000, i.e. 
16× cache_size)"); - println!(" --tasks N num_tasks (default: available_parallelism)"); - println!(" --iters N num_iterations / max iterations (default: 30)"); - println!(" --warmup N warmup_iterations (default: 1)"); - println!(" --ratio F read_write_ratio 0.0–1.0 (default: 0.80)"); - println!(" --value-size N value size in bytes (default: 5120)"); - println!(" --zipf F Zipf exponent (default: 1.07) sweep 0.8–1.5"); - println!(" --pattern P access pattern: zipfian|uniform|sequential (default: zipfian)"); - println!(" --cold-start start with empty cache instead of pre-populated"); - println!(" --key-size N minimum key string width, zero-padded (default: 0)"); - println!(" --write-new F fraction of writes inserting a fresh key (default: 0.0)"); - println!(" --seed N RNG seed (default: 42)"); - println!(" --pool-size N value pool size (default: 10000)"); - println!(" --sample-every N latency sampling rate 1/N (default: 10)"); - println!(" --caches / -c LIST comma-separated list of caches to run"); - println!(" --info print this full reference"); - println!(" --help / -h print brief flag summary\n"); + print_flag_table(); + println!(); println!(" All flags accept both space-separated and = forms:"); println!(" --zipf 1.2 --zipf=1.2\n"); println!(" Available caches:"); println!(" moka, minimoka, quick_cache, schnellru, stretto, lrucache, tinyufo,"); - println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue, neocache\n"); + println!(" foyer_cache, cached_lru, clru_cache, lrumem, sieve, two_queue\n"); println!(" Usage:"); println!(" cargo run --release # run all caches"); - println!(" cargo run --release -- -c moka,neocache # run a subset"); + println!(" cargo run --release -- -c moka,quick_cache # run a subset"); println!(" cargo run --release -- --size 10000 --zipf 0.9 # custom config"); println!(" cargo run --release -- --pattern uniform --ratio 0.5 # different workload"); println!(" cargo run --release -- --info # print this reference\n"); @@ -934,12 
+1023,11 @@ async fn main() { .map(|(name, results)| (*name, AggregatedStats::from_results(results))) .collect(); - // Sort by median throughput (descending) - stats_map.sort_by(|a, b| { - b.1.throughput_median - .partial_cmp(&a.1.throughput_median) - .unwrap() - }); + // Sort by median throughput (descending). `total_cmp` instead of + // `partial_cmp().unwrap()` so a NaN (which today cannot occur, but + // guarding against it is free — `total_cmp` is a drop-in here) + // would simply sort to the end rather than panic the binary. + stats_map.sort_by(|a, b| b.1.throughput_median.total_cmp(&a.1.throughput_median)); // Print combined results table println!( @@ -955,7 +1043,9 @@ async fn main() { "p99cv%", "wp99µs", "hit%", - "±hitci" + "±hitci", + BOLD = bold(), + RESET = reset(), ); println!("{}", "─".repeat(99)); @@ -998,24 +1088,25 @@ async fn main() { println!(" (!) cv% > 10%: throughput is noisy — consider rerunning on a quieter system"); } - println!("{DIM} legend: ops/sec=throughput ±ci95=95% CI cv%=variability tailµs=p99−p50 wp99µs=write p99 ±hitci=hit-rate CI{RESET}"); + println!( + "{DIM} legend: ops/sec=throughput ±ci95=95% CI cv%=variability tailµs=p99−p50 wp99µs=write p99 ±hitci=hit-rate CI{RESET}", + DIM = dim(), + RESET = reset(), + ); - // Overall winners + // Overall winners. `total_cmp` everywhere so a NaN (which today cannot + // occur, but defensive-cmp is cheap) cannot panic the summary line.
let (best_throughput, best_throughput_stats) = &stats_map[0]; let best_latency = stats_map .iter() - .min_by(|a, b| { - a.1.latency_p99_median - .partial_cmp(&b.1.latency_p99_median) - .unwrap() - }) + .min_by(|a, b| a.1.latency_p99_median.total_cmp(&b.1.latency_p99_median)) .unwrap(); let best_consistency = stats_map .iter() .min_by(|a, b| { let a_amp = a.1.latency_p99_median - a.1.latency_p50_median; let b_amp = b.1.latency_p99_median - b.1.latency_p50_median; - a_amp.partial_cmp(&b_amp).unwrap() + a_amp.total_cmp(&b_amp) }) .unwrap(); @@ -1027,7 +1118,7 @@ async fn main() { let multi = stats_map.len() > 1; let w = |s: &str| -> String { if multi { - format!("{BOLD}{GREEN}{s}{RESET}") + format!("{}{}{}{}", bold(), green(), s, reset()) } else { s.to_string() }