From 6ec3806d694228e803f0e5eb45e615761bd78af2 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 13 Jun 2026 20:06:04 +0000 Subject: [PATCH 1/2] perf: reuse unchanged viewport rows in boo ui In `boo ui`, every ~15ms frame re-serialized every viewport row through libghostty's ScreenFormatter, allocating a fresh writer per row, then diffed the result against the row cache and discarded unchanged rows. The per-frame cost was O(rows) VT serializations plus O(rows) heap allocations even when a single cell changed, so the UI did far more work than a plain attach and felt slower than raw SSH. Two changes: - `appendTermRow` now formats straight into the caller's buffer via `Allocating.fromArrayList`, reusing capacity instead of allocating a writer per row. - A per-row viewport cache stores each row's serialized bytes and reuses them when libghostty reports the row unchanged. A row is reused only when no full repaint is forced, its row identity (page node and offset) is unchanged, and its dirty bit is clear; the dirty bits are then cleared once per frame. The identity check is required, not just the dirty bit: scrolling the active screen relocates a visual row onto a different libghostty row while the row itself stays clean, so a dirty-only cache would reuse stale bytes. A unit test pins this contract and an integration test covers heavy scrolling end to end. Microbenchmark (bench/render.zig, 50x200, c_allocator): status quo, all rows: 80267 ns/frame, 150 allocs/frame reused buffer, all rows: 76639 ns/frame, 0 allocs/frame 1 changed row, rest cached: 1760 ns/frame Localized updates (typing echo, progress bars, status lines) cost ~46x less per frame than the status quo (80267 -> 1760 ns); full repaints keep the same serialization cost minus the per-row allocations. 
--- bench/render.zig | 195 +++++++++++++++++++++++++++++++++++++++++++ build.zig | 21 +++++ src/ui.zig | 157 ++++++++++++++++++++++++++++++++-- test/integration.zig | 38 +++++++++ 4 files changed, 403 insertions(+), 8 deletions(-) create mode 100644 bench/render.zig diff --git a/bench/render.zig b/bench/render.zig new file mode 100644 index 0000000..64ff04d --- /dev/null +++ b/bench/render.zig @@ -0,0 +1,195 @@ +//! Microbenchmark for the `boo ui` viewport render hot path. +//! +//! Compares serialization strategies for one repaint frame: +//! A_full status quo: a fresh Allocating writer per row, all rows. +//! B_full reused buffer, all rows (full-repaint frame, e.g. scroll). +//! C_local reused buffer, re-serialize one changed row + reuse the +//! rest from cache (localized-update frame: typing, progress). +//! +//! Build/run: `zig build bench`. Reports ns/frame and the allocation +//! count for A vs B. +const std = @import("std"); +const vt = @import("ghostty-vt"); + +const rows: u16 = 50; +const cols: u16 = 200; +const frames: usize = 2000; + +/// Allocator that counts allocations, delegating to a backing one. 
+const CountingAllocator = struct { + backing: std.mem.Allocator, + count: usize = 0, + + fn allocator(self: *CountingAllocator) std.mem.Allocator { + return .{ .ptr = self, .vtable = &.{ + .alloc = alloc, + .resize = resize, + .remap = remap, + .free = free, + } }; + } + fn alloc(ctx: *anyopaque, len: usize, a: std.mem.Alignment, ra: usize) ?[*]u8 { + const self: *CountingAllocator = @ptrCast(@alignCast(ctx)); + self.count += 1; + return self.backing.rawAlloc(len, a, ra); + } + fn resize(ctx: *anyopaque, m: []u8, a: std.mem.Alignment, n: usize, ra: usize) bool { + const self: *CountingAllocator = @ptrCast(@alignCast(ctx)); + return self.backing.rawResize(m, a, n, ra); + } + fn remap(ctx: *anyopaque, m: []u8, a: std.mem.Alignment, n: usize, ra: usize) ?[*]u8 { + const self: *CountingAllocator = @ptrCast(@alignCast(ctx)); + return self.backing.rawRemap(m, a, n, ra); + } + fn free(ctx: *anyopaque, m: []u8, a: std.mem.Alignment, ra: usize) void { + const self: *CountingAllocator = @ptrCast(@alignCast(ctx)); + self.backing.rawFree(m, a, ra); + } +}; + +/// Status-quo serialization: a fresh Allocating writer per row. 
+fn rowStatusQuo(alloc: std.mem.Allocator, term: *vt.Terminal, y: u16, out: *std.ArrayList(u8)) !void { + const screen = term.screens.active; + if (term.cols == 0) return; + const start = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = y } }) orelse return; + const end = screen.pages.pin(.{ .viewport = .{ .x = term.cols - 1, .y = y } }) orelse return; + var formatter: vt.formatter.ScreenFormatter = .init(screen, .vt); + formatter.content = .{ .selection = vt.Selection.init(start, end, true) }; + var aw: std.Io.Writer.Allocating = .init(alloc); + defer aw.deinit(); + aw.writer.print("{f}", .{formatter}) catch return error.OutOfMemory; + const bytes = aw.writer.buffered(); + try out.appendSlice(alloc, bytes); + if (std.mem.indexOf(u8, bytes, "\x1b]8;") != null) { + try out.appendSlice(alloc, "\x1b]8;;\x1b\\"); + } +} + +/// Optimized serialization: format directly into the caller's buffer +/// (reused across rows/frames), no per-row allocation. +fn rowReused(alloc: std.mem.Allocator, term: *vt.Terminal, y: u16, out: *std.ArrayList(u8)) !void { + const screen = term.screens.active; + if (term.cols == 0) return; + const start = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = y } }) orelse return; + const end = screen.pages.pin(.{ .viewport = .{ .x = term.cols - 1, .y = y } }) orelse return; + var formatter: vt.formatter.ScreenFormatter = .init(screen, .vt); + formatter.content = .{ .selection = vt.Selection.init(start, end, true) }; + const at = out.items.len; + { + var aw: std.Io.Writer.Allocating = .fromArrayList(alloc, out); + defer out.* = aw.toArrayList(); + aw.writer.print("{f}", .{formatter}) catch return error.OutOfMemory; + } + if (std.mem.indexOf(u8, out.items[at..], "\x1b]8;") != null) { + try out.appendSlice(alloc, "\x1b]8;;\x1b\\"); + } +} + +fn fillScreen(alloc: std.mem.Allocator, term: *vt.Terminal) !void { + var stream = vt.TerminalStream.initAlloc(alloc, vt.TerminalStream.Handler.init(term)); + defer stream.deinit(); + var buf: std.ArrayList(u8) = 
.empty; + defer buf.deinit(alloc); + try buf.appendSlice(alloc, "\x1b[H"); + for (0..rows) |y| { + // A mix of default and 256-color SGR segments per row. + var x: usize = 0; + while (x < cols - 12) : (x += 12) { + const color: usize = (y * 7 + x) % 231 + 16; + try buf.print(alloc, "\x1b[38;5;{d}mword{d:0>2} ", .{ color, (x / 12) % 100 }); + } + try buf.appendSlice(alloc, "\x1b[0m"); + if (y + 1 < rows) try buf.appendSlice(alloc, "\r\n"); + } + stream.nextSlice(buf.items); +} + +pub fn main() !void { + // boo runs on the C allocator at runtime (src/main.zig); benchmark + // with the same one so per-row allocation cost is realistic. + const base = std.heap.c_allocator; + + var stdout_buf: [4096]u8 = undefined; + var stdout_w = std.fs.File.stdout().writer(&stdout_buf); + const out = &stdout_w.interface; + + try out.print("boo ui render bench: {d} rows x {d} cols, {d} frames\n\n", .{ rows, cols, frames }); + + // --- A_full: status quo, all rows, per-row Allocating --- + { + var ca: CountingAllocator = .{ .backing = base }; + const alloc = ca.allocator(); + var term = try vt.Terminal.init(alloc, .{ .cols = cols, .rows = rows, .max_scrollback = 512 * 1024 }); + defer term.deinit(alloc); + try fillScreen(alloc, &term); + + var body: std.ArrayList(u8) = .empty; + defer body.deinit(alloc); + const alloc_before = ca.count; + var timer = try std.time.Timer.start(); + for (0..frames) |_| { + body.clearRetainingCapacity(); + for (0..rows) |y| try rowStatusQuo(alloc, &term, @intCast(y), &body); + } + const ns = timer.read(); + try out.print("A_full (status quo, all rows): {d:>7} ns/frame, {d:>7} allocs/frame\n", .{ + ns / frames, (ca.count - alloc_before) / frames, + }); + } + + // --- B_full: reused buffer, all rows --- + { + var ca: CountingAllocator = .{ .backing = base }; + const alloc = ca.allocator(); + var term = try vt.Terminal.init(alloc, .{ .cols = cols, .rows = rows, .max_scrollback = 512 * 1024 }); + defer term.deinit(alloc); + try fillScreen(alloc, &term); + + var 
body: std.ArrayList(u8) = .empty; + defer body.deinit(alloc); + const alloc_before = ca.count; + var timer = try std.time.Timer.start(); + for (0..frames) |_| { + body.clearRetainingCapacity(); + for (0..rows) |y| try rowReused(alloc, &term, @intCast(y), &body); + } + const ns = timer.read(); + try out.print("B_full (reused buf, all rows): {d:>7} ns/frame, {d:>7} allocs/frame\n", .{ + ns / frames, (ca.count - alloc_before) / frames, + }); + } + + // --- C_local: one changed row re-serialized, rest reused from cache --- + { + const alloc = base; + var term = try vt.Terminal.init(alloc, .{ .cols = cols, .rows = rows, .max_scrollback = 512 * 1024 }); + defer term.deinit(alloc); + try fillScreen(alloc, &term); + + // Per-row cache buffers, primed once. + var cache: [rows]std.ArrayList(u8) = undefined; + for (&cache) |*c| c.* = .empty; + defer for (&cache) |*c| c.deinit(alloc); + for (0..rows) |y| try rowReused(alloc, &term, @intCast(y), &cache[y]); + + var body: std.ArrayList(u8) = .empty; + defer body.deinit(alloc); + var timer = try std.time.Timer.start(); + for (0..frames) |i| { + body.clearRetainingCapacity(); + // One row is "dirty" this frame; re-serialize it, reuse rest. + const dirty: u16 = @intCast(i % rows); + for (0..rows) |y| { + if (y == dirty) { + cache[y].clearRetainingCapacity(); + try rowReused(alloc, &term, @intCast(y), &cache[y]); + } + try body.appendSlice(alloc, cache[y].items); + } + } + const ns = timer.read(); + try out.print("C_local (1 dirty row, rest cached): {d:>7} ns/frame\n", .{ns / frames}); + } + + try out.flush(); +} diff --git a/build.zig b/build.zig index 9e9f335..9df46a4 100644 --- a/build.zig +++ b/build.zig @@ -68,4 +68,25 @@ pub fn build(b: *std.Build) void { test_all_step.dependOn(test_step); test_all_step.dependOn(integration_step); + + // Benchmark: the viewport render hot path (no TTY required). 
+ const bench_mod = b.createModule(.{ + .root_source_file = b.path("bench/render.zig"), + .target = target, + .optimize = optimize, + .link_libc = true, + }); + if (b.lazyDependency("ghostty", .{ + .target = target, + .optimize = optimize, + })) |dep| { + bench_mod.addImport("ghostty-vt", dep.module("ghostty-vt")); + } + const bench_exe = b.addExecutable(.{ + .name = "boo-bench", + .root_module = bench_mod, + }); + const bench_run = b.addRunArtifact(bench_exe); + const bench_step = b.step("bench", "Run the render microbenchmark"); + bench_step.dependOn(&bench_run.step); } diff --git a/src/ui.zig b/src/ui.zig index fe7568c..dc4ca0b 100644 --- a/src/ui.zig +++ b/src/ui.zig @@ -1044,6 +1044,40 @@ pub fn run(alloc: std.mem.Allocator, dir: []const u8) !void { try ui.loop(pipe_fds[0]); } +/// Cached serialization of one viewport (terminal) row, keyed on the +/// libghostty row identity so a row that scrolls to a new position is +/// re-serialized even when its own contents did not change. +const ViewportRow = struct { + /// The bytes `appendTermRow` produced for this row last time. + bytes: std.ArrayList(u8) = .empty, + /// The page node the cached row lived in, compared by pointer + /// identity. Null until first serialized. + node: ?*const anyopaque = null, + /// The row offset within `node`. + offset: u16 = 0, + /// Whether `bytes`/`node`/`offset` hold a serialized row. + valid: bool = false, + + fn deinit(self: *ViewportRow, alloc: std.mem.Allocator) void { + self.bytes.deinit(alloc); + } +}; + +/// Whether `entry` may be reused for the row currently at `pin` instead +/// of re-serializing it. Reuse is safe only when a full repaint is not +/// forced, the entry holds a serialized row, the libghostty row identity +/// (page node and offset within it) is unchanged, and the row is not +/// dirty. 
Scrolling the active screen relocates a visual row onto a +/// different identity even while its own bytes stay clean, so the +/// identity comparison is required and the dirty bit alone is not +/// enough. +fn viewportRowReusable(entry: *const ViewportRow, pin: vt.Pin, full_render: bool) bool { + if (full_render or !entry.valid) return false; + if (entry.node != @as(*const anyopaque, @ptrCast(pin.node))) return false; + if (entry.offset != pin.y) return false; + return !pin.isDirty(); +} + const Ui = struct { alloc: std.mem.Allocator, dir: []const u8, @@ -1106,6 +1140,9 @@ const Ui = struct { /// Per-screen-row cache of the last emitted bytes; rows that did /// not change are not re-sent. row_cache: std.ArrayList(std.ArrayList(u8)) = .empty, + /// Per-screen-row cache of the serialized viewport row bytes, + /// reused across frames when libghostty reports the row unchanged. + viewport_cache: std.ArrayList(ViewportRow) = .empty, need_render: bool = true, /// Force every row out on the next render (resize, C-a l). full_render: bool = true, @@ -1145,6 +1182,8 @@ const Ui = struct { self.message.deinit(self.alloc); for (self.row_cache.items) |*row| row.deinit(self.alloc); self.row_cache.deinit(self.alloc); + for (self.viewport_cache.items) |*row| row.deinit(self.alloc); + self.viewport_cache.deinit(self.alloc); } // -- Main loop --------------------------------------------------------- @@ -2617,6 +2656,15 @@ const Ui = struct { row.deinit(alloc); } + // The viewport cache tracks the same rows as the row cache. 
+ while (self.viewport_cache.items.len < l.rows) { + try self.viewport_cache.append(alloc, .{}); + } + while (self.viewport_cache.items.len > l.rows) { + var row = self.viewport_cache.pop() orelse break; + row.deinit(alloc); + } + var body: std.ArrayList(u8) = .empty; defer body.deinit(alloc); @@ -2638,6 +2686,10 @@ const Ui = struct { const cursor = self.cursorSequence(); + // The frame consumed this round's dirty bits; clear them so the + // next frame's viewport cache reuse reflects only new changes. + if (self.liveView()) |v| v.term.screens.active.pages.clearDirty(); + if (body.items.len == 0 and !self.full_render) { // Row content unchanged; the cursor may still have moved. try frame.appendSlice(alloc, "\x1b[?25l"); @@ -2828,6 +2880,41 @@ const Ui = struct { try appendClipped(alloc, out, "", w); } + /// Append the serialized bytes for viewport row `y`, reusing the + /// cached serialization when libghostty reports the row unchanged. + /// + /// A row is reused only when its libghostty identity (the page node + /// and the offset within it) is unchanged and its dirty bit is + /// clear. Scrolling the active screen moves a visual row onto a + /// different page row, changing the identity and forcing a fresh + /// serialization; an in-place edit sets the dirty bit. `composeFrame` + /// clears the dirty bits once per frame, so a clear bit means + /// "unchanged since the last serialization". 
+ fn appendViewportRow(self: *Ui, v: *View, y: u16, out: *std.ArrayList(u8)) !void { + const alloc = self.alloc; + const screen = v.term.screens.active; + const pin = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = y } }) orelse { + if (y < self.viewport_cache.items.len) { + self.viewport_cache.items[y].valid = false; + } + return; + }; + const entry = &self.viewport_cache.items[y]; + const node: *const anyopaque = @ptrCast(pin.node); + + if (viewportRowReusable(entry, pin, self.full_render)) { + try out.appendSlice(alloc, entry.bytes.items); + return; + } + + entry.bytes.clearRetainingCapacity(); + try appendTermRow(alloc, &v.term, y, &entry.bytes); + entry.node = node; + entry.offset = pin.y; + entry.valid = true; + try out.appendSlice(alloc, entry.bytes.items); + } + fn composeViewportCell(self: *Ui, y: u16, out: *std.ArrayList(u8)) !void { const alloc = self.alloc; @@ -2864,7 +2951,7 @@ const Ui = struct { } if (y < v.term.rows) { - try appendTermRow(alloc, &v.term, y, out); + try self.appendViewportRow(v, y, out); } try out.appendSlice(alloc, sgr_reset); @@ -2972,15 +3059,17 @@ pub fn appendTermRow( var formatter: vt.formatter.ScreenFormatter = .init(screen, .vt); formatter.content = .{ .selection = vt.Selection.init(start, end, true) }; - var aw: std.Io.Writer.Allocating = .init(alloc); - defer aw.deinit(); - aw.writer.print("{f}", .{formatter}) catch return error.OutOfMemory; - - const bytes = aw.writer.buffered(); - try out.appendSlice(alloc, bytes); + // Format straight into `out`, reusing its capacity, so a repaint + // does not allocate a fresh writer for every row. + const begin = out.items.len; + { + var aw: std.Io.Writer.Allocating = .fromArrayList(alloc, out); + defer out.* = aw.toArrayList(); + aw.writer.print("{f}", .{formatter}) catch return error.OutOfMemory; + } // A row that opened a hyperlink must not leak it into the next // row or the sidebar. 
- if (std.mem.indexOf(u8, bytes, "\x1b]8;") != null) { + if (std.mem.indexOf(u8, out.items[begin..], "\x1b]8;") != null) { try out.appendSlice(alloc, "\x1b]8;;\x1b\\"); } } @@ -3989,3 +4078,55 @@ test "appendTermRow renders styled content for one row only" { try appendTermRow(alloc, &term, 3, &out); try std.testing.expectEqual(@as(usize, 0), out.items.len); } + +test "viewportRowReusable re-serializes a clean row that scrolled away" { + const alloc = std.testing.allocator; + + var term = try vt.Terminal.init(alloc, .{ .cols = 20, .rows = 4 }); + defer term.deinit(alloc); + var stream = term.vtStream(); + defer stream.deinit(); + + stream.nextSlice("\x1b[HAAA\r\nBBB\r\nCCC\r\nDDD"); + const screen = term.screens.active; + + // Mimic a settled frame: one cache entry per viewport row, tagged + // with that row's libghostty identity, then clear the dirty bits. + var entries: [4]ViewportRow = .{ .{}, .{}, .{}, .{} }; + defer for (&entries) |*e| e.deinit(alloc); + for (0..4) |y| { + const pin = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = @intCast(y) } }).?; + try appendTermRow(alloc, &term, @intCast(y), &entries[y].bytes); + entries[y].node = @ptrCast(pin.node); + entries[y].offset = pin.y; + entries[y].valid = true; + } + screen.pages.clearDirty(); + + // Nothing changed: every settled row is reusable. + for (0..4) |y| { + const pin = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = @intCast(y) } }).?; + try std.testing.expect(viewportRowReusable(&entries[y], pin, false)); + } + + // Scroll one line. The rows that moved up are not marked dirty, but + // they now show different content, so their stale cache entries must + // not be reused: clean-but-moved is exactly what the dirty bit + // misses and the identity check catches. 
+ stream.nextSlice("\r\nEEE"); + var clean_moved: usize = 0; + for (0..4) |y| { + const pin = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = @intCast(y) } }).?; + if (!pin.isDirty()) { + clean_moved += 1; + try std.testing.expect(!viewportRowReusable(&entries[y], pin, false)); + } + } + // The scroll must have produced a clean-but-moved row, or this test + // would not exercise the identity check at all. + try std.testing.expect(clean_moved > 0); + + // A forced full repaint never reuses, even an unchanged row. + const pin0 = screen.pages.pin(.{ .viewport = .{ .x = 0, .y = 0 } }).?; + try std.testing.expect(!viewportRowReusable(&entries[0], pin0, true)); +} diff --git a/test/integration.zig b/test/integration.zig index d562f44..70242c0 100644 --- a/test/integration.zig +++ b/test/integration.zig @@ -1444,6 +1444,44 @@ test "ui: a row touching the viewport's right edge keeps its last cell" { try std.testing.expect(std.mem.indexOf(u8, first, "edge") != null); } +test "ui: scrolling output keeps the viewport in sync with the session" { + const alloc = std.testing.allocator; + var h = try Harness.init(alloc); + defer h.deinit(); + + try h.startDetached("scroll", &.{"sh"}); + + var ui = try PtyClient.spawn(&h, &.{"ui"}, 24, 100); + defer ui.deinit(); + try ui.waitFor("scroll"); + + // Print far more lines than the viewport is tall so the active + // screen scrolls many times. Each scroll moves every visible row + // onto a different libghostty row, so the viewport cache must key + // on row identity and not reuse a stale serialization. + try h.sendLine("scroll", "i=1; while [ $i -le 200 ]; do echo LINE-$i; i=$((i+1)); done; echo DONE-MARK"); + try ui.waitFor("DONE-MARK"); + + const screen = try renderScreen(alloc, ui.output.items, 24, 100); + defer alloc.free(screen); + + // Every LINE-N still on screen must appear in strictly increasing + // order, and the newest line must have rendered. 
A stale reused row + // would put an older number out of sequence or duplicate one. + var prev: i64 = -1; + var idx: usize = 0; + while (std.mem.indexOfPos(u8, screen, idx, "LINE-")) |pos| { + var end = pos + "LINE-".len; + while (end < screen.len and std.ascii.isDigit(screen[end])) end += 1; + idx = pos + "LINE-".len; + if (end == idx) continue; // "LINE-$i" from the echoed command + const n = std.fmt.parseInt(i64, screen[idx..end], 10) catch continue; + try std.testing.expect(n > prev); + prev = n; + } + try std.testing.expectEqual(@as(i64, 200), prev); +} + test "ui: the empty state shows the ghost and the keybind hint" { const alloc = std.testing.allocator; var h = try Harness.init(alloc); From b6617f6df1f8c5cccb2cb3ab65dbb2d9b1508ecc Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 13 Jun 2026 20:13:04 +0000 Subject: [PATCH 2/2] fix(test/integration.zig): avoid command-echo race in ui scroll test waitFor("DONE-MARK") could match the echoed command line, which contains the literal marker, before any loop output rendered. On macOS the wait returned immediately and the screen held no LINE-N yet, so the monotonic check saw nothing (expected 200, found -1). Wait on "LINE-200" instead, which the command does not contain literally, so the wait only matches real output. --- test/integration.zig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/integration.zig b/test/integration.zig index 70242c0..42dda1c 100644 --- a/test/integration.zig +++ b/test/integration.zig @@ -1458,9 +1458,11 @@ test "ui: scrolling output keeps the viewport in sync with the session" { // Print far more lines than the viewport is tall so the active // screen scrolls many times. Each scroll moves every visible row // onto a different libghostty row, so the viewport cache must key - // on row identity and not reuse a stale serialization. 
- try h.sendLine("scroll", "i=1; while [ $i -le 200 ]; do echo LINE-$i; i=$((i+1)); done; echo DONE-MARK"); - try ui.waitFor("DONE-MARK"); + // on row identity and not reuse a stale serialization. Wait on + // "LINE-200" (which the echoed command does not contain literally) + // so the wait cannot race the command echo. + try h.sendLine("scroll", "i=1; while [ $i -le 200 ]; do echo LINE-$i; i=$((i+1)); done"); + try ui.waitFor("LINE-200"); const screen = try renderScreen(alloc, ui.output.items, 24, 100); defer alloc.free(screen);