Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion oximeter/db/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -153,5 +153,9 @@ name = "protocol"
harness = false

[[bench]]
name = "oxql"
name = "oxql_field"
harness = false

[[bench]]
name = "oxql_measurement"
harness = false
34 changes: 31 additions & 3 deletions oximeter/db/benches/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,48 @@ To restore into a test database. Note: take care not to restore into a real Oxid
$ oximeter/db/benches/load_field_tables.sh /tmp/oximeter-field-bench [port]
```

Then run the benchmark:
Then run the benchmark. `BENCH_METRIC` selects between server-side wall-clock (`latency`) and CPU time (`cpu_time`):

```bash
$ cargo bench --package oximeter-db --bench oxql -- --save-baseline main
$ BENCH_METRIC=latency cargo bench --package oximeter-db --bench oxql_field -- --save-baseline main
```

To evaluate performance changes, run the benchmark using a new baseline:

```bash
$ cargo bench --package oximeter-db --bench oxql -- --save-baseline my-branch
$ BENCH_METRIC=latency cargo bench --package oximeter-db --bench oxql_field -- --save-baseline my-branch
```

Then compare with `critcmp`:

```bash
$ critcmp main my-branch
```

## Measurement query

We have a separate benchmark that measures the performance of combined field and measurement lookup, fetching a set of representative series using `| last 1` to simulate the use case of fetching recent metrics to ship to Prometheus or similar. This benchmark requires backing up and restoring measurement tables. Use a limited time window, since these tables grow to tens of gigabytes or more on real racks.

To fetch measurement data:

```bash
$ mkdir -p /tmp/oximeter-measurement-bench
$ START=2026-05-01T00:00:00
$ END=2026-05-01T01:00:00
$ oximeter/db/benches/backup_measurement_tables.sh /tmp/oximeter-measurement-bench measurements_cumulativeu64 $START $END [port]
$ oximeter/db/benches/backup_measurement_tables.sh /tmp/oximeter-measurement-bench measurements_f32 $START $END [port]
```

To restore into a test database:

```bash
$ oximeter/db/benches/load_measurement_tables.sh /tmp/oximeter-measurement-bench measurements_cumulativeu64 [port]
$ oximeter/db/benches/load_measurement_tables.sh /tmp/oximeter-measurement-bench measurements_f32 [port]
```

Run the benchmark. `OXQL_BENCH_START_TIME` and `OXQL_BENCH_END_TIME` should fall within the window you backed up, in `YYYY-MM-DDTHH:MM:SS` format:

```bash
$ OXQL_BENCH_START_TIME=$START OXQL_BENCH_END_TIME=$END BENCH_METRIC=latency \
cargo bench --package oximeter-db --bench oxql_measurement -- --save-baseline main
```
2 changes: 1 addition & 1 deletion oximeter/db/benches/backup_field_tables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ for table in timeseries_schema fields_{bool,i8,i16,i32,i64,ipaddr,string,u8,u16,
fi
output="$OUTPUT_DIR/${table}.native.gz"
echo "Backing up $DATABASE.$table ($count rows) to $output"
clickhouse client --port "$PORT" \
clickhouse client --port "$PORT" --compression=1 \
--query "SELECT * FROM $DATABASE.$table FORMAT Native" \
| gzip > "$output"
done
39 changes: 39 additions & 0 deletions oximeter/db/benches/backup_measurement_tables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
#
# Dump a partial ClickHouse measurement table (time-windowed slice) to disk in
# native format. Run against a test rack with realistic oximeter data. Used to
# capture test data for benchmarking.
#
# Usage: ./backup_measurement_tables.sh <output_dir> <table> <window_start> <window_end> [port]
#
#   output_dir    directory for the gzipped native-format dump (created if missing)
#   table         one measurement table name, e.g. measurements_cumulativeu64
#   window_start  inclusive lower bound on the `timestamp` column
#   window_end    exclusive upper bound on the `timestamp` column
#   port          ClickHouse native-protocol port (defaults to 9000)

# Fail fast: exit on errors, on unset variables, and on mid-pipeline failures.
set -euo pipefail

if [[ $# -lt 4 ]]; then
echo "Usage: $0 <output_dir> <table> <window_start> <window_end> [port]" >&2
exit 1
fi

OUTPUT_DIR="$1"
TABLE="$2"
WINDOW_START="$3"
WINDOW_END="$4"
# Optional fifth argument; default to ClickHouse's standard native port.
PORT="${5:-9000}"
DATABASE="oximeter"

mkdir -p "$OUTPUT_DIR"

# Back up a single measurement table. These tables can be very large, so we limit to the specified time range, and only operate on one measurement table at a time.

# Note: Use SELECT rather than RESTORE because we may not have access to the
# remote ClickHouse's local disk, or have backups enabled at all.

# Count matching rows first, so we can skip empty windows and report progress.
count=$(clickhouse client --port "$PORT" \
--query "SELECT count() FROM $DATABASE.$TABLE WHERE timestamp >= '$WINDOW_START' AND timestamp < '$WINDOW_END'")
if [[ "$count" -eq 0 ]]; then
echo "No rows in $DATABASE.$TABLE for window; nothing to back up"
exit 0
fi
output="$OUTPUT_DIR/${TABLE}.native.gz"
echo "Backing up $DATABASE.$TABLE ($count rows) to $output"
# --compression=1 compresses the client/server transfer; gzip compresses the
# on-disk artifact.
clickhouse client --port "$PORT" --compression=1 \
--query "SELECT * FROM $DATABASE.$TABLE WHERE timestamp >= '$WINDOW_START' AND timestamp < '$WINDOW_END' FORMAT Native" \
| gzip > "$output"
148 changes: 148 additions & 0 deletions oximeter/db/benches/common/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Shared helpers for OxQL benchmarks.

// Copyright 2026 Oxide Computer Company

use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, BenchmarkId};
use oximeter_db::Client;
use oximeter_db::oxql::query::QueryAuthzScope;
use std::net::IpAddr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

pub const DEFAULT_CLICKHOUSE_PORT: u16 = 9000;

/// The metric to benchmark.
///
/// Set via the `BENCH_METRIC` environment variable.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BenchMetric {
    /// Server-side query latency.
    Latency,
    /// Total cpu time (user and system).
    CpuTime,
}

/// Read the [`BenchMetric`] selection from the `BENCH_METRIC` environment
/// variable.
///
/// # Panics
///
/// Panics if the variable is unset or not one of `cpu_time` / `latency`,
/// including the observed value in the message to ease debugging.
pub fn bench_metric() -> BenchMetric {
    match std::env::var("BENCH_METRIC").as_deref() {
        Ok("cpu_time") => BenchMetric::CpuTime,
        Ok("latency") => BenchMetric::Latency,
        other => {
            panic!("BENCH_METRIC must be 'cpu_time' or 'latency', got {other:?}")
        }
    }
}

/// Return the ClickHouse IP address to benchmark against.
///
/// Reads `CLICKHOUSE_ADDRESS`, falling back to localhost when the variable
/// is unset or does not parse as an IP address.
pub fn get_clickhouse_addr() -> IpAddr {
    let fallback = IpAddr::from([127, 0, 0, 1]);
    match std::env::var("CLICKHOUSE_ADDRESS") {
        Ok(raw) => raw.parse().unwrap_or(fallback),
        Err(_) => fallback,
    }
}

/// Return the ClickHouse native-protocol port to benchmark against.
///
/// Reads `CLICKHOUSE_PORT`, falling back to [`DEFAULT_CLICKHOUSE_PORT`]
/// when the variable is unset or not a valid `u16`.
pub fn get_clickhouse_port() -> u16 {
    match std::env::var("CLICKHOUSE_PORT").map(|raw| raw.parse::<u16>()) {
        Ok(Ok(port)) => port,
        _ => DEFAULT_CLICKHOUSE_PORT,
    }
}

/// Build the full socket address of the ClickHouse server under test from
/// the `CLICKHOUSE_ADDRESS` / `CLICKHOUSE_PORT` environment variables.
pub fn get_socket_addr() -> SocketAddr {
    let ip = get_clickhouse_addr();
    let port = get_clickhouse_port();
    SocketAddr::new(ip, port)
}

/// Connect to the ClickHouse server under test and verify it responds,
/// returning a shared client handle.
///
/// Panics if the server cannot be pinged, since the benchmarks are
/// meaningless without a live database.
pub fn get_client(rt: &tokio::runtime::Runtime) -> Arc<Client> {
    let log = slog::Logger::root(slog::Discard, slog::o!());
    let client = Arc::new(Client::new(get_socket_addr(), &log));
    rt.block_on(async { client.ping().await.unwrap() });
    client
}

/// Benchmark a single OxQL query using criterion, measuring either server-side
/// latency or cpu time per [`BenchMetric`].
///
/// The query is run once untimed to warm server-side caches, then run
/// repeatedly under `iter_custom`, accumulating the chosen metric from the
/// ClickHouse profile events attached to each query summary. The two metrics
/// share one timing loop and differ only in which profile-event counters are
/// summed.
pub fn bench_oxql_query(
    group: &mut BenchmarkGroup<'_, WallTime>,
    rt: &tokio::runtime::Runtime,
    client: Arc<Client>,
    bench_name: &str,
    bench_id: String,
    query: String,
    metric: &BenchMetric,
) {
    // Run the query once without recording performance to warm caches.
    rt.block_on(client.oxql_query(&query, QueryAuthzScope::Fleet)).unwrap();

    // The profile-event counters that make up the requested metric. CPU time
    // is the sum of user and system time; latency is server-side wall-clock.
    let profile_keys: &'static [&'static str] = match metric {
        BenchMetric::CpuTime => {
            &["UserTimeMicroseconds", "SystemTimeMicroseconds"]
        }
        BenchMetric::Latency => &["RealTimeMicroseconds"],
    };

    group.bench_function(BenchmarkId::new(bench_name, &bench_id), |bench| {
        bench.to_async(rt).iter_custom(|iters| {
            let client = client.clone();
            let query = query.clone();
            async move {
                let mut total = Duration::ZERO;
                for _ in 0..iters {
                    let result = client
                        .oxql_query(&query, QueryAuthzScope::Fleet)
                        .await
                        .unwrap();
                    // Profile events are occasionally and inexplicably
                    // empty; default to 0 for rare missing events.
                    let micros: i64 = result
                        .query_summaries
                        .iter()
                        .map(|s| {
                            profile_keys
                                .iter()
                                .map(|key| {
                                    s.profile_summary
                                        .get(*key)
                                        .copied()
                                        .unwrap_or(0)
                                })
                                .sum::<i64>()
                        })
                        .sum();
                    total += Duration::from_micros(micros.max(0) as u64);
                }
                total
            }
        });
    });
}
48 changes: 48 additions & 0 deletions oximeter/db/benches/load_measurement_tables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
#
# Load a measurement table backup into a ClickHouse instance for benchmarking.
# Fails if the destination table already contains data.
#
# Usage: ./load_measurement_tables.sh <input_dir> <table> [port]
#
#   input_dir  directory containing <table>.native.gz, as produced by
#              backup_measurement_tables.sh
#   table      one measurement table name, e.g. measurements_cumulativeu64
#   port       ClickHouse native-protocol port (defaults to 9000)

# Fail fast: exit on errors, on unset variables, and on mid-pipeline failures.
set -euo pipefail

if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <input_dir> <table> [port]" >&2
    exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SCHEMA_DIR="$SCRIPT_DIR/../schema/single-node"

INPUT_DIR="$1"
TABLE="$2"
PORT="${3:-9000}"

DATABASE="oximeter"

# Fail fast if there is nothing to load, before touching the database at all.
input="$INPUT_DIR/${TABLE}.native.gz"
if [[ ! -f "$input" ]]; then
    echo "No backup for table $TABLE in $INPUT_DIR" >&2
    exit 1
fi

# Initialize schema. db-init.sql is fully IF NOT EXISTS, so this is a no-op
# if the schema is already in place.
echo "Initializing database schema..."
clickhouse client --port "$PORT" --multiquery < "$SCHEMA_DIR/db-init.sql"

# Error if destination table already has data, so benchmark datasets are
# never mixed together.
count=$(clickhouse client --port "$PORT" \
    --query "SELECT count() FROM $DATABASE.$TABLE")
if [[ "$count" -gt 0 ]]; then
    echo "Error: $DATABASE.$TABLE already contains data ($count rows)" >&2
    echo "Refusing to load into a non-empty table." >&2
    exit 1
fi

# Note: Use INSERT rather than RESTORE because we may not have access to the
# local ClickHouse's disk, or have backups enabled at all.
echo "Loading $DATABASE.$TABLE from $input"
gunzip -c "$input" | clickhouse client --port "$PORT" \
    --query "INSERT INTO $DATABASE.$TABLE FORMAT Native"
Loading
Loading