Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,13 @@ wait-timeout = "0.2"
criterion = { version = "0.5", default-features = false, features = ["html_reports"] }
proptest = "1"

[features]
# Opt-in stress bench group for `cargo bench --bench diff`. Adds a 20_000-
# component-per-side diff workload that pushes total bench wall time past the
# default <30s budget. CI and weekend perf checks should leave this off; use
# it locally when investigating algorithmic changes (#29).
bench-stress = []

[[bench]]
name = "parse"
harness = false
Expand Down
171 changes: 129 additions & 42 deletions benches/diff.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,32 @@
//! Benchmarks for the diff core. Measures `diff::diff(before, after)` on
//! the bundled axios-incident fixture pair (~3 components per side, the
//! shape of a typical small PR diff) and on a synthetic large pair (200
//! components per side, simulating a monorepo SBOM).
//! Benchmarks for the diff core. Closes #29.
//!
//! The synthetic large pair is generated in-process to avoid committing a
//! 200-component fixture file. The shape is deterministic so the bench
//! numbers are stable across runs.
//! The diff core (`src/diff/`) is on the critical path for every bomdrift
//! run, so we want a perf-regression catcher for any change that touches it.
//! This bench measures `diff::diff(before, after)` across three input sizes
//! (small / large / stress, listed below) and three diff workloads per size:
//!
//! - **end_to_end**: realistic mix of added / removed / version_changed /
//! license_changed — the production hot path.
//! - **self_diff**: identical inputs, exercises the BTreeMap construction and
//! per-key traversal without producing any change pairs. Isolates the cost
//! of `group_by_key` + iteration.
//! - **all_license_changed**: every key intersects, every intersecting pair
//! has a different license set. Isolates the license-comparison branch in
//! `diff_one_key`.
//!
//! Input sizes mirror real-world bomdrift workloads:
//!
//! - **small**: 500 components per side (typical mid-sized JS app).
//! - **large**: 5000 components per side (typical large monorepo).
//! - **stress**: 20_000 components per side (upper-bound stress, gated behind
//! the `bench-stress` cargo feature so the default run stays under 30s).
//!
//! Run with `cargo bench --bench diff`.
//! Run with stress group: `cargo bench --bench diff --features bench-stress`.

use std::fs;

use criterion::{Criterion, black_box, criterion_group, criterion_main};
use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};

use bomdrift::diff;
use bomdrift::model::{Component, Ecosystem, Relationship, Sbom, SbomFormat};
Expand All @@ -23,19 +38,17 @@ fn load(path: &str) -> Sbom {
parse::parse_with_format(v, None).expect("must normalize to Sbom")
}

fn synth_component(i: usize, version_offset: usize) -> Component {
let name = format!("pkg-{i:04}");
let mut version = format!("1.{}.0", i % 50);
if i.is_multiple_of(2) {
version = format!("1.{}.0", (i % 50) + version_offset);
}
/// Build one synthetic component. Deterministic — given `i`, `version`, and
/// `licenses`, the output is byte-identical across runs so bench medians stay
/// stable.
fn synth_component(i: usize, version: &str, licenses: Vec<String>) -> Component {
let name = format!("pkg-{i:06}");
let purl = format!("pkg:npm/{name}@{version}");
Component {
name: name.clone(),
version,
name,
version: version.to_string(),
ecosystem: Ecosystem::Npm,
purl: Some(purl.clone()),
licenses: vec!["MIT".to_string()],
licenses,
supplier: None,
hashes: Vec::new(),
relationship: Relationship::Unknown,
Expand All @@ -44,48 +57,122 @@ fn synth_component(i: usize, version_offset: usize) -> Component {
}
}

fn synth_sbom(n: usize, version_offset: usize) -> Sbom {
let components = (0..n).map(|i| synth_component(i, version_offset)).collect();
/// Build a baseline SBOM of `n` components, all at version 1.0.0, all MIT.
fn synth_sbom(n: usize) -> Sbom {
let components = (0..n)
.map(|i| synth_component(i, "1.0.0", vec!["MIT".to_string()]))
.collect();
Sbom {
format: SbomFormat::CycloneDx,
serial: None,
components,
}
}

fn bench_diff(c: &mut Criterion) {
let mut g = c.benchmark_group("diff");
/// Build the `after` SBOM for the **end_to_end** workload.
///
/// Each slot `i` in `0..n` is assigned a change kind by `i % 20`, so the
/// output is the same on every run and always holds exactly `n` components:
///
/// - residues 0 and 10 (2 of every 20 slots): same index, version moved off
///   `1.0.0` (to `1.0.1` and `2.0.0` respectively) — meant to show up as
///   `version_changed`.
/// - residue 1 (1 of 20): same index and version, license `Apache-2.0`
///   instead of `MIT` — meant to show up as `license_changed`.
/// - residue 2 (1 of 20): the slot is filled with index `n + i` instead of
///   `i`. Indices in `n..2n` never occur in a `0..n` baseline, so against
///   `synth_sbom(n)` these are new keys (`added`) and the replaced indices
///   are absent (`removed`). The replaced keys are spread across the whole
///   range, not clustered at the tail.
/// - every other residue (16 of 20): `1.0.0` / `MIT`, identical to what
///   `synth_sbom` produces for that index.
fn synth_after_mixed(n: usize) -> Sbom {
    let components = (0..n)
        .map(|i| {
            // Decide the (index, version, license) triple first, then build
            // the component in one place.
            let (index, version, license) = match i % 20 {
                0 => (i, "1.0.1", "MIT"),
                10 => (i, "2.0.0", "MIT"),
                1 => (i, "1.0.0", "Apache-2.0"),
                // Shift into the disjoint `n..2n` index range.
                2 => (n + i, "1.0.0", "MIT"),
                _ => (i, "1.0.0", "MIT"),
            };
            synth_component(index, version, vec![license.to_string()])
        })
        .collect();

    Sbom {
        format: SbomFormat::CycloneDx,
        serial: None,
        components,
    }
}

// Real fixture pair (axios incident: 3-4 components per side).
let before = load("tests/fixtures/cdx-minimal.json");
let after = load("tests/fixtures/cdx-after.json");
g.bench_function("axios_fixture_pair", |b| {
/// Build the `after` SBOM for the **all_license_changed** workload.
///
/// Produces `n` components with indices `0..n`, all at version `1.0.0` and
/// all licensed `Apache-2.0`. Compared against `synth_sbom(n)` (same indices
/// and version, but `MIT`), every component differs only in its license
/// list; the intent is to exercise the license-comparison path of the diff
/// and nothing else.
fn synth_after_all_license_changed(n: usize) -> Sbom {
    let relicensed = |i: usize| synth_component(i, "1.0.0", vec!["Apache-2.0".to_string()]);

    Sbom {
        format: SbomFormat::CycloneDx,
        serial: None,
        components: (0..n).map(relicensed).collect(),
    }
}

fn bench_diff(c: &mut Criterion) {
// Real fixture pair (axios incident: 3-4 components per side). Kept from
// the original bench as a smoke check that the bench harness still wires
// through the real parse → diff path, not just synthetic data.
let fixture_before = load("tests/fixtures/cdx-minimal.json");
let fixture_after = load("tests/fixtures/cdx-after.json");
c.bench_function("diff_axios_fixture_pair", |b| {
b.iter(|| {
let cs = diff::diff(black_box(&before), black_box(&after));
let cs = diff::diff(black_box(&fixture_before), black_box(&fixture_after));
black_box(cs);
});
});

// Synthetic monorepo-scale pair (200 components per side, half
// version-changed).
let synth_before = synth_sbom(200, 0);
let synth_after = synth_sbom(200, 1);
g.bench_function("synth_monorepo_200", |b| {
b.iter(|| {
let cs = diff::diff(black_box(&synth_before), black_box(&synth_after));
black_box(cs);
// Synthetic sizes. `bench-stress` adds the 20_000-component group; the
// default set targets the "under 30s total" acceptance criterion.
let mut sizes: Vec<usize> = vec![500, 5_000];
if cfg!(feature = "bench-stress") {
sizes.push(20_000);
}

let mut g = c.benchmark_group("diff_synth");
for &n in &sizes {
// Throughput is reported in components/sec, counting the components on
// both sides of the diff. Comparing it across sizes lets reviewers tell
// a per-component slowdown apart from a structural (scaling) one.
g.throughput(Throughput::Elements((n as u64) * 2));

let before = synth_sbom(n);
let after_mixed = synth_after_mixed(n);
let after_all_lic = synth_after_all_license_changed(n);

// end_to_end: realistic mix of all four change kinds.
g.bench_with_input(BenchmarkId::new("end_to_end", n), &n, |b, _| {
b.iter(|| {
let cs = diff::diff(black_box(&before), black_box(&after_mixed));
black_box(cs);
});
});
});

// Self-diff (no changes): exercises every key through the BTreeMap
// intersection without producing any add/remove/version_changed work.
g.bench_function("synth_self_diff_200", |b| {
b.iter(|| {
let cs = diff::diff(black_box(&synth_before), black_box(&synth_before));
black_box(cs);
// self_diff: identical inputs. Isolates the BTreeMap construction
// (`group_by_key`) + per-key traversal cost with no change pairs
// produced. This is the lower bound on diff cost for a given size.
g.bench_with_input(BenchmarkId::new("self_diff", n), &n, |b, _| {
b.iter(|| {
let cs = diff::diff(black_box(&before), black_box(&before));
black_box(cs);
});
});
});

// all_license_changed: every intersecting pair has a different
// license set. Isolates the license-comparison branch in
// `diff_one_key` (the version-intersection scan that routes pairs
// to `license_changed`).
g.bench_with_input(BenchmarkId::new("all_license_changed", n), &n, |b, _| {
b.iter(|| {
let cs = diff::diff(black_box(&before), black_box(&after_all_lic));
black_box(cs);
});
});
}
g.finish();
}

Expand Down
26 changes: 26 additions & 0 deletions docs/src/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,29 @@ beyond what `ureq` already brings.
the release until split or explicitly waived in the changelog.
- Tests-only files (`tests/**`) are exempt — large integration tests
are easier to read as one file than as a maze of helpers.

## Perf reference: diff core benchmark

The diff core (`src/diff/`) runs on every bomdrift invocation, so it
has its own criterion bench under [`benches/diff.rs`][diff-bench].
That bench is the perf reference for any change that touches diff
internals (`group_by_key`, `diff_one_key`, the multi-version
pair-by-version logic).

Three workloads per size (500, 5_000, opt-in 20_000 with
`--features bench-stress`):

- **end_to_end** — realistic mix (80% unchanged, 10% version_changed,
5% license_changed, 5% added/removed). Production hot path.
- **self_diff** — identical inputs. Lower bound on diff cost; isolates
`group_by_key` BTreeMap construction + per-key iteration with no
change-pair work.
- **all_license_changed** — every intersecting pair has a different
license set. Isolates the license-comparison branch.

Run with `cargo bench --bench diff` (under 30s on a laptop). Record
medians in the PR description for any change that touches
`src/diff/`; a >5% regression on the production
`diff_synth/end_to_end/5000` workload warrants either a fix or an
explicit explanation.

[diff-bench]: https://github.com/Metbcy/bomdrift/blob/main/benches/diff.rs
Loading