From bdb7d64ea08df15b7d15924e5e5f667c3ffdb42b Mon Sep 17 00:00:00 2001 From: yawzhang Date: Wed, 17 Jun 2026 15:26:43 +0800 Subject: [PATCH] SDSTOR-22729: RCA and ut for index root split crash --- .../bug-root-split-crash-sb-stale-root.md | 154 +++++ docs/structures/index.md | 619 ++++++++++++++++++ docs/structures/index_btree_nodes.md | 177 +++++ docs/structures/index_cp_and_recovery.md | 244 +++++++ src/tests/test_index_crash_recovery.cpp | 63 ++ 5 files changed, 1257 insertions(+) create mode 100644 docs/incidents/bug-root-split-crash-sb-stale-root.md create mode 100644 docs/structures/index.md create mode 100644 docs/structures/index_btree_nodes.md create mode 100644 docs/structures/index_cp_and_recovery.md diff --git a/docs/incidents/bug-root-split-crash-sb-stale-root.md b/docs/incidents/bug-root-split-crash-sb-stale-root.md new file mode 100644 index 000000000..a8fa75b7b --- /dev/null +++ b/docs/incidents/bug-root-split-crash-sb-stale-root.md @@ -0,0 +1,154 @@ +# RCA: Root Split Crash Leaves SB Pointing to Old Root + +**Date:** 2026-05-22 +**Component:** HomeStore / IndexTable / BTree / IndexWBCache +**Severity:** Critical (crash on restart, blocks index recovery) + +--- + +## Symptom + +After a SIGKILL, the process crashes during restart with a B-tree sanity failure. + +In release builds: + +``` +Child node level mismatch ... child level: 1, expected: 0 +``` + +In debug builds, recovery may abort earlier inside `IndexTable::repair_root_node`: + +``` +root already has a valid edge ..., so we should have found the new root node +``` + +--- + +## Root Cause Chain + +### Condition 1: `on_root_changed` is called before `split_node` completes + +`Btree::check_split_root` (`src/include/homestore/btree/detail/btree_mutate_impl.ipp`): + +``` +1. Allocate new_root (level=2) +2. Call on_root_changed(new_root) ← SB and journal updated here +3. split_node(new_root, old_root, ...) 
← old root modified here +``` + +`IndexTable::on_root_changed` updates the in-memory SB (`index_table_sb.root_node`, +`btree_depth`) and calls `wb_cache().transact_bufs(meta_buf, new_root_buf)`, which +links `meta_buf → new_root_buf` in the CP flush DAG and appends a meta/root +transaction to the journal. + +During step 3, `split_node` calls `transact_nodes({child_node2}, {}, old_root, new_root)`, +which invokes `link_buf(new_root_buf, old_root_buf)`. Because `new_root_buf` was +allocated in the current CP (`m_created_cp_id == icp_ctx->id()`), `link_buf`'s +Condition 1 (`wb_cache.cpp:373`) bypasses new_root_buf and substitutes +`new_root_buf->m_up_buffer = meta_buf` as the real up-buffer. As a result, +old_root_buf links **directly to meta_buf**, not through new_root_buf. + +**Final CP flush DAG:** + +``` +meta_buf [wait=2] + ├── new_root_buf [independent flush] + └── old_root_buf [wait=1] + └── child_node2_buf +``` + +new_root_buf and old_root_buf are siblings under meta_buf and can flush in any order. + +### Condition 2: split_node writes old_root into a transient disk state with empty `edge_info` + +`split_node` modifies old_root in memory: + +- `child_node1->set_next_bnode(child_node2->node_id())` — old_root.next_bnode = child_node2.blkid +- `move_out_to_right_by_size(...)` → `invalidate_edge()` — old_root.edge_info = EMPTY +- `parent_node->update(parent_ind, child_node2->link_info())` — new_root.edge_info = child_node2 + +old_root_buf is then flushed to disk as a direct down-buffer of meta_buf (see Condition 1). +On-disk state of old_root after flush: + +- `old_root.level = 1` +- `old_root.edge_info = EMPTY` +- `old_root.next_bnode = child_node2.blkid` (child_node2 is a level=1 interior node) + +This transient state is only valid within the full tree structure; it is resolved +when new_root takes ownership. If the process crashes here and recovery restarts from +old_root in this state, the structure is misinterpreted (see Condition 5). 
+ +### Condition 3: SB persistence is deferred to end-of-CP flush + +Even though `on_root_changed` updated the in-memory SB, the persisted superblock +write happens only after the last buffer completes during CP flush +(`IndexWBCache::process_write_completion` → `index_service().write_sb(ordinal)` +in `src/lib/index/wb_cache.cpp`). + +A SIGKILL after Condition 2 but before this deferred write leaves the on-disk SB still +pointing to the old root. +**Evidence (gdb):** `index_table_sb.root_node=1125904201810030, btree_depth=1` +— matches the old root, not the new level-2 root. + +### Condition 4: A new level-2 root exists on disk but is not referenced by the SB + +Recovery DAG logs show: `id=1407379178523696 ... INTERIOR level=2 ... NEW` + +The new root was written and is present on disk, but the persisted SB still points +to old_root (Condition 3). Recovery therefore starts from the old root. + +From gdb on coredump: + +- `index_table_sb.root_node = 1125904201810030` +- `index_table_sb.btree_depth = 1` + +### Condition 5: `repair_root_node` incorrectly writes old root's `edge_info` from `next_bnode` + +`IndexWBCache::recover_buf` → `IndexService::update_root` → `IndexTable::repair_root_node` +(`src/include/homestore/index/index_table.hpp`). + +`repair_root_node` is designed to repair a root-change marker by converting `next_bnode` +into `edge_info`. With old_root's on-disk state from Condition 2 +(`edge_info=EMPTY`, `next_bnode=child_node2.blkid`), it executes: + +```cpp +auto edge_id = n->next_bnode(); // = child_node2.blkid (level=1 interior) +n->set_next_bnode(empty_bnodeid); +n->set_edge_value(BtreeLinkInfo{edge_id, 0}); // sets old_root.edge_info = child_node2 (level=1)! +write_node_impl(n, cp_ctx); +``` + +This writes `old_root.edge_info = child_node2.blkid` where child_node2 is a level=1 +interior node — violating the invariant that a level=1 parent's edge child must be level=0. 
+ +**Evidence (gdb on coredump, state captured after repair_root_node ran):** + +- `node_id = 1125904201810030` (old_root) +- `level = 1` +- `edge_info.m_bnodeid = 1125904201813044` (confirmed level=1 interior node) + +Subsequent B-tree validation (`validate_node`) finds a level=1 parent with a level=1 edge +child and aborts with: + +``` +Child node level mismatch ... child level: 1, expected: 0 +``` + +In debug builds, `repair_root_node` additionally asserts when it finds `has_valid_edge()` +already true on the SB root (it expects to find the new root instead), triggering: + +``` +root already has a valid edge ..., so we should have found the new root node +``` + +--- + +## Trigger Conditions + +All of the following must occur, in this order: + +1. A B-tree root split is triggered (tree height increases from level=1 to level=2) +2. `on_root_changed` runs and links `meta_buf → new_root_buf` in the CP flush DAG +3. `split_node` completes: old_root.edge_info is invalidated, old_root.next_bnode = child_node2.blkid; old_root_buf is flushed to disk +4. SIGKILL arrives after step 3 but before CP flush completes and writes the updated SB + diff --git a/docs/structures/index.md b/docs/structures/index.md new file mode 100644 index 000000000..253b2f64f --- /dev/null +++ b/docs/structures/index.md @@ -0,0 +1,619 @@ +# Index Write-Back Cache and Checkpoint Flush Mechanism + +This document describes the structure and flow of HomeStore's index write-back cache (WBC) and checkpoint (CP) flush mechanism. + +## 0. 
Glossary + +| Term | Definition | +|----------------------------------|----------------------------------------------------------------------------------------------------------------------------| +| **IndexWBCache** | Write-back cache managing all index node buffers, handling allocation, dirty tracking, and flush orchestration | +| **IndexBuffer** | In-memory representation of a B-tree node with metadata (blkid, dirty CP, creation CP, state, parent/child links) | +| **MetaIndexBuffer** | Special buffer for index table superblock, inherits from IndexBuffer, always flushed last | +| **Ordinal** | Unique identifier (uint32_t) for each IndexTable within IndexService, used to map buffers to their tables | +| **BlkId** | Block ID - physical location of a node on the virtual device (vdev) | +| **CP (Checkpoint)** | Periodic flush operation that persists all dirty index buffers to disk atomically | +| **DAG (Directed Acyclic Graph)** | Dependency structure where parent buffers depend on child buffers being flushed first | +| **m_wait_for_down_buffers** | Counter tracking how many child buffers must flush before this parent buffer can flush | +| **m_up_buffer** | Pointer from child to parent buffer in the dependency chain | +| **Journal** | Write-ahead log (WAL) recording structural changes (splits/merges) during a CP for crash recovery | +| **Temporary MetaBuffer** | MetaIndexBuffer created during recovery from journal, used only for dependency tracking, has dangling superblock reference | +| **Real MetaBuffer** | MetaIndexBuffer owned by IndexTable (m_sb_buffer), has valid superblock reference, used during normal operation | +| **Committed node** | Node whose m_dirtied_cp_id matches the CP being recovered, indicating it was successfully written before crash | +| **Repair** | Process of fixing B-tree node structure during recovery (fixing links, removing stale children, etc.) 
| + +## 0.1 Architecture Overview + +## 0.2 Additional Reading + +- [Index B-Tree Nodes (Concepts and On-Disk Layout)](index_btree_nodes.md) +- [Index CP Flush and Recovery (Ordering, Journal, and Root Handling)](index_cp_and_recovery.md) +- [RCA: Root Split Crash Leaves SB Pointing to Old Root](../incidents/bug-root-split-crash-sb-stale-root.md) + + + +### System Context +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HomeStore │ +│ ┌────────────┐ ┌──────────────┐ ┌─────────────────────────┐ │ +│ │ Repl │ │ Log Store │ │ Index Service │ │ +│ │ Service │ │ Service │ │ (Multiple IndexTables) │ │ +│ └────────────┘ └──────────────┘ └───────────┬─────────────┘ │ +│ │ │ +│ ┌─────────────────────▼─────────────┐ │ +│ │ IndexWBCache │ │ +│ │ - Buffer allocation/lifecycle │ │ +│ │ - Dirty tracking │ │ +│ │ - Dependency management (DAG) │ │ +│ │ - CP flush orchestration │ │ +│ └─────────────┬───────────────────┬─┘ │ +│ │ │ │ +│ ┌─────────────▼─────────┐ ┌──────▼──┐│ +│ │ Virtual Device │ │ Meta ││ +│ │ (Block storage) │ │ Service ││ +│ └───────────────────────┘ └─────────┘│ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Key Design Principles + +1. **Write-Back Caching**: Modifications stay in memory until CP flush, enabling batching and reducing I/O +2. **Dependency-Ordered Flush**: Parent nodes flush only after all children, ensuring crash consistency +3. **Copy-on-Write (COW) for CP**: Buffers modified in new CP while old CP is flushing get copied +4. **Journal-Based Recovery**: WAL records structural changes, enabling reconstruction of dependency DAG after crash +5. 
**Meta Buffer Isolation**: Superblock updates are separate from B-tree node updates, flushed last + +### Problem Being Solved + +**Without WBC**: Every B-tree modification would require: +- Immediate disk write (slow) +- Complex coordination for atomic updates across multiple nodes +- Poor performance for write-heavy workloads + +**With WBC**: +- Batch multiple modifications in memory +- Flush atomically during CP with dependency ordering +- Recover partial flushes using journal +- Achieve both performance and crash consistency + +## 1. Core Components + +### 1.1 IndexWBCache +The write-back cache that manages all index node buffers. It handles: +- Buffer allocation and lifecycle +- Buffer dirty tracking +- Flush orchestration with dependency management + +### 1.2 IndexCPContext +Checkpoint context for a single CP cycle. Manages: +- List of dirty buffers pending flush +- Journal of transactions in this CP +- Completion signaling (Future/Promise) + +### 1.3 IndexBuffer +The buffer structure for each index node. Key members: +- `m_blkid` - Block ID where this buffer is persisted +- `m_dirtied_cp_id` - CP that last modified this buffer +- `m_created_cp_id` - CP when this buffer was created +- `m_state` - CLEAN, DIRTY, or FLUSHING +- `m_up_buffer` - Pointer to parent buffer in dependency chain +- `m_wait_for_down_buffers` - Counter: how many children must flush before this buffer +- `m_is_meta_buf` - Whether this is the meta (super block) buffer +- `m_node_freed` - Whether this node was freed + +### 1.4 MetaIndexBuffer +Special buffer for the index table super block. 
Inherits from IndexBuffer and: +- Holds a reference to `superblk` +- Must be the last buffer flushed in a CP + +### 1.5 Buffer State Machine + +``` + ┌──────────────┐ + │ CLEAN │ (Initial state after flush or allocation) + └───────┬──────┘ + │ + │ write_buf() called + │ (node modified in current CP) + │ + ┌───────▼──────┐ + │ DIRTY │ (In dirty list, waiting for CP flush) + └───────┬──────┘ + │ + │ CP starts, m_wait_for_down_buffers == 0 + │ do_flush_one_buf() called + │ + ┌───────▼──────┐ + │ FLUSHING │ (Write to disk in progress) + └───────┬──────┘ + │ + │ Write completion callback + │ process_write_completion() called + │ + ┌───────▼──────┐ + │ CLEAN │ (Flush complete, may be evicted from cache) + └──────────────┘ + +Special transitions: +- DIRTY → DIRTY (new CP): If modified again in new CP while old CP flushing +- CLEAN → Evicted: Cache eviction removes buffer from memory +- Any → Recovery: Crash recovery creates temporary buffers from journal +``` + +## 2. Dependency Chain (DAG) + +Buffers form a directed acyclic graph (DAG) based on parent-child relationships in the B-tree: + +``` +Meta Buffer (super block) + │ + ├── Interior Node A + │ │ + │ ├── Leaf Node A1 + │ └── Leaf Node A2 + │ + └── Interior Node B + │ + └── Leaf Node B1 +``` + +### 2.1 Key Rules +1. A child buffer's `m_up_buffer` points to its parent +2. A parent buffer's `m_wait_for_down_buffers` = number of children linked to it +3. **A buffer can only be flushed when `m_wait_for_down_buffers == 0`** +4. 
After a child flushes, its parent's `m_wait_for_down_buffers` is decremented + +### 2.2 Link Buffer (`link_buf`) +Establishes parent-child relationship between buffers: + +```cpp +void link_buf(IndexBufferPtr up_buf, IndexBufferPtr down_buf, bool is_sibling_link) { + // Remove from old parent if exists + if (down_buf->m_up_buffer) { + down_buf->m_up_buffer->remove_down_buffer(down_buf); + } + + // Link to new parent + down_buf->m_up_buffer = up_buf; + up_buf->add_down_buffer(down_buf); // Increments up_buf->m_wait_for_down_buffers +} +``` + +## 3. Flush Flow + +### 3.1 Triggering Flush +`async_cp_flush()` is called when checkpoint timer fires: + +```cpp +folly::Future IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) +``` + +### 3.2 Steps in Flush + +1. **Journal Update**: Write CP journal to meta block +2. **Prepare Iteration**: Reset iterator for dirty buffer list +3. **Parallel Flush**: Multiple fibers (`m_cp_flush_fibers`) concurrently flush buffers +4. **Dependency Resolution**: Buffers flushed in order: children → parents → meta +5. **Completion**: When all buffers done, call `m_vdev->cp_flush()` then `cp_ctx->complete()` + +### 3.3 Getting Next Buffer to Flush + +`get_next_bufs_internal()` retrieves the next buffer to flush: + +```cpp +// 1. First, try to flush the parent of the just-flushed buffer +if (prev_flushed_buf && prev_flushed_buf->m_up_buffer) { + if (prev_flushed_buf->m_up_buffer->m_wait_for_down_buffers.decrement_testz()) { + // Parent's wait count reached 0, can flush it now + } +} + +// 2. 
Then, get buffers from the dirty list +// A buffer can be flushed if: +// - State is DIRTY +// - m_dirtied_cp_id == current cp_id +// - m_wait_for_down_buffers == 0 +``` + +### 3.4 Flush One Buffer + +`do_flush_one_buf()` handles individual buffer flush: + +```cpp +void do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr buf, bool part_of_batch) { + buf->set_state(FLUSHING); + + if (buf->is_meta_buf()) { + // Meta buffer: directly update super block + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + process_write_completion(cp_ctx, buf); // Meta completes immediately (no async write) + } else if (buf->m_node_freed) { + // Freed node: no need to write + process_write_completion(cp_ctx, buf); + } else { + // Normal node: async write to vdev + m_vdev->async_write(buf->raw_buffer(), m_node_size, buf->m_blkid) + .thenValue([buf, cp_ctx](auto) { + process_write_completion(cp_ctx, buf); + }); + } +} +``` + +### 3.5 Write Completion + +`process_write_completion()` handles buffer flush completion: + +```cpp +void process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr buf) { + resource_mgr().dec_dirty_buf_size(m_node_size); + + auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); + + if (next_buf) { + do_flush_one_buf(cp_ctx, next_buf, false); + } else if (!has_more) { + // All buffers done, now flush vdev metadata + iomanager.run_on_forget(cp_mgr().pick_blocking_io_fiber(), [this, cp_ctx]() { + m_vdev->cp_flush(cp_ctx); // Blocking IO + cp_ctx->complete(true); + }); + } +} +``` + +### 3.6 Buffer Flush Done + +`on_buf_flush_done_internal()` decrements dependency counters: + +```cpp +std::pair on_buf_flush_done_internal(IndexCPContext* cp_ctx, + IndexBufferPtr buf) { + // Clear debug down_buffers list + buf->m_down_buffers.clear(); + + if (cp_ctx->m_dirty_buf_count.decrement_testz()) { + // Count reached 0, this was the last buffer + buf->set_state(CLEAN); + return {nullptr, false}; + } else { + // Count still > 0, get parent's next 
buffer + get_next_bufs_internal(cp_ctx, 1u, buf, buf_list); + buf->set_state(CLEAN); + return {buf_list[0], true}; + } +} +``` + +## 4. Key Call Paths + +### 4.1 B-tree Split → Root Change + +When a B-tree root node splits: + +1. `Btree::check_split_root()` allocates the new root +2. `on_root_changed()` is called: + ```cpp + btree_status_t on_root_changed(BtreeNodePtr new_root, void* context) { + // 1. Update super block with new root info + wb_cache().refresh_meta_buf(m_sb_buffer, cp_ctx); + + // 2. Link new_root → meta and flush in order + wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, cp_ctx); + return success; + } + ``` + +3. `transact_bufs()` establishes dependencies: + ```cpp + void transact_bufs(ordinal, parent_buf, child_buf, new_nodes, freed_nodes, cp_ctx) { + if (parent_buf) { + link_buf(parent_buf, child_buf, false); // child → parent + } + // Link new nodes as children of child_buf + for (buf : new_nodes) { + link_buf(child_buf, buf, true); + } + // Add all to dirty list + add_to_dirty_list(...); + } + ``` + +### 4.2 Normal Node Split + +When a non-root node splits: + +1. `split_node()` is called +2. `transact_nodes()` is called with: + - `new_nodes`: [new_sibling_node] + - `left_child_node`: original_node (now left sibling) + - `parent_node`: parent of original_node +3. This creates chain: new_sibling → original_node → parent → ... + +## 5. Meta Buffer Flush Specifics + +The meta buffer has special handling: + +1. **No Async Write**: Unlike normal nodes, meta buffer writes directly to meta service: + ```cpp + if (buf->is_meta_buf()) { + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + process_write_completion(cp_ctx, buf); // Immediate completion + } + ``` + +2. **Must Be Last**: Meta buffer should have `m_wait_for_down_buffers == 0` before flushing, meaning all other buffers (including root) must complete first. + +3. 
**Super Block Update**: After all buffers flush, `process_write_completion()` calls: + ```cpp + for (ordinal : m_updated_ordinals) { + index_service().write_sb(ordinal); // Write index table super blocks + } + ``` + +## 6. Important Invariants + +1. **Dirty Buffer Count**: `m_dirty_buf_count` must equal the number of dirty buffers pending flush +2. **Dependency Consistency**: If `m_wait_for_down_buffers > 0`, there must be buffers in `m_down_buffers` +3. **Flush Order**: No buffer can be flushed before all its children (`m_wait_for_down_buffers == 0`) +4. **Meta Dependency**: Meta buffer can only be flushed when all other buffers in this CP are done + +## 7. Common Bug Patterns + +### 7.1 Orphaned Down Wait +**Symptom**: Meta buffer has `m_wait_for_down_buffers > 0` but no child's `up` points to it. + +**Cause**: `link_buf()` was called to add a child to meta, but later: +- The child's `up` was changed to point elsewhere +- The child was freed/overwritten without cleaning up the link + +### 7.2 Buffer Overwrite During Flush +**Symptom**: A buffer expected in dirty list is missing, causing dependency chain to break. + +**Cause**: Next CP triggers before current CP completes, and operations in the new CP overwrite buffers from the previous CP. + +### 7.3 Race in Multi-Fiber Flush +**Symptom**: Intermittent failures with `m_dirty_buf_count` mismatch. + +**Cause**: When `m_cp_flush_fibers > 1`, buffers are flushed in parallel. Proper locking in `get_next_bufs_internal()` is required. + +## 8. Recovery Flow + +### 8.1 Recovery Entry Point +`IndexWBCache::recover(sisl::byte_view sb)` called from `IndexService::start()` after all index tables are loaded from meta service. + +### 8.2 Recovery Steps + +1. 
**Load Buffers from Journal**: `IndexCPContext::recover()` reads journal and reconstructs buffer dependency DAG + - Creates temporary `IndexBuffer` objects for all dirty buffers from prior CP + - Creates temporary `MetaIndexBuffer` for parent buffers marked as meta + - Links buffers with `m_up_buffer` pointers to reconstruct dependency chain + - Performs sanity check to ensure all dependencies are valid (no freed up_buffers, proper ordinals, etc.) + +2. **Two Recovery Passes**: + - **Pass 1**: Handle new nodes and freed nodes (iterate through `bufs` map) + - For new nodes: commit their block IDs if both current and up buffer committed, otherwise remove from up buffer's dependency and prune + - For freed nodes: + - If committed: mark as deleted, write to disk, add up buffer to pending list for repair + - If not committed: either keep (if created before current CP) or delete (if created in current CP), remove from up buffer dependency and prune + - **Pass 2**: Repair parent nodes and pending buffers + - First repair `potential_parent_recovered_bufs` via `parent_recover()` - these are interior nodes that may have stale links + - Then call `recover_buf()` recursively on all pending buffers (buffers whose children were new/freed) + - For uncommitted nodes, call `repair_index_node()` to fix their content + - For committed nodes with meta parent, call `update_root()` to update super block + - Finally repair `pruned_bufs_to_repair` via `repair_index_node()` - these are parents whose dependencies became zero + +3. **Prune Up Buffers**: When a child buffer is removed (e.g., freed), call `prune_up_buffers()` to clean up dependency + - Decrements up buffer's `m_wait_for_down_buffers` via `update_up_buffer_counters()` recursively + - Adds up buffer to `pruned_bufs_to_repair` list if dependency reaches zero + - Adds grand-up buffer to repair list if not meta and also has zero dependency + +4. 
**Sanity Check**: After recovery, verify all repaired buffers via `index_service().sanity_check()` for each ordinal + +5. **Completion**: `m_in_recovery = false`, temporary buffers destroyed, `m_vdev->recovery_completed()` called + +### 8.3 Recovery vs Normal Operation Differences + +| Aspect | Normal Operation | Recovery | +|------------------|--------------------------------------|-------------------------------------| +| Buffer source | Runtime allocation | Created from journal | +| Meta buffer type | IndexTable::m_sb_buffer (persistent) | Temporary MetaIndexBuffer | +| Write method | Add to dirty list, async CP flush | Immediate sync_write (via write_buf)| +| Buffer lifecycle | Cached in memory, may be evicted | Temporary, destroyed after recovery | +| Cache usage | Buffers inserted into cache | Cache operations skipped | + +## 9. Meta Buffer Dual Paths + +### 9.1 Path 1: Normal Operation (Persistent Path) + +**Entry**: `IndexTable::on_root_changed()` + +**Flow**: +1. `IndexTable::m_sb_buffer` (persistent, lives with IndexTable object) +2. Update `m_sb` fields (root_node, root_link_version, btree_depth, node counts) +3. `refresh_meta_buf(m_sb_buffer, cp_ctx)` - creates new meta buffer via COW if needed (wb_cache.cpp:200-220) + - If `m_dirtied_cp_id > cp_ctx->id()`: return false (CP mismatch) + - If `m_dirtied_cp_id == cp_ctx->id()`: copy superblk to buffer via `copy_sb_to_buf()` + - Else: create new `MetaIndexBuffer` via COW, set `m_dirtied_cp_id = cp_ctx->id()`, add to dirty list +4. `transact_bufs(ordinal, m_sb_buffer, root_buf, {}, {})` - establishes root→meta dependency +5. `link_buf(m_sb_buffer, root_buf, false)` - sets `root_buf->m_up_buffer = m_sb_buffer` +6. CP flush: `do_flush_one_buf()` writes to metadata service via `meta_service().update_sub_sb()` + +**Key**: This path uses the IndexTable's real meta buffer reference. + +### 9.2 Path 2: Recovery (Temporary Path) + +**Entry**: `IndexCPContext::process_txn_record()` during recover + +**Flow**: +1. 
Create temporary `MetaIndexBuffer` with empty local `superblk< index_table_sb > tmp_sb` (dangling reference bug - see Bug 11.2) +2. `m_up_buffer = temporary_meta` - set on child buffers during `rec_to_buf()` lambda +3. `m_wait_for_down_buffers++` - counts pending children via `add_down_buffer()` +4. Purpose: Only for dependency tracking, ensuring correct repair order (children before parents) +5. Lifecycle: Created in `process_txn_record()`, stored in `buf_map`, destroyed when no ref exists + +**Key**: This path creates temporary meta buffers that are never flushed. During recovery: +- `repair_root_node()` does NOT update `m_sb->root_node` in the superblock +- Instead, it fixes the old root node to point to new root via edge_value (if root changed) +- The superblock continues to point to the old root, which now has a valid edge to the new root +- Superblock updates only happen during normal operation via `on_root_changed()` → `m_sb->root_node = new_root->node_id()` + +## 10. Root Change Flow + +### 10.1 Root Change from B-tree Operations + +**Trigger**: Split, merge, or other operations that create new root + +**Flow**: +``` +Btree operation (split/merge) + └─> IndexTable::on_root_changed(new_root, cp_ctx) + ├─> Update m_sb->root_node, depth, node counts + ├─> wb_cache().refresh_meta_buf(m_sb_buffer, cp_ctx) + │ └─> May create new m_sb_buffer via COW + │ + └─> wb_cache().transact_bufs(ordinal, m_sb_buffer, root_buf, {}, {}, cp_ctx) + └─> wb_cache().link_buf(m_sb_buffer, root_buf, false, cp_ctx) + ├─> root_buf->m_up_buffer = m_sb_buffer ✅ + ├─> m_sb_buffer->add_down_buffer(root_buf) + └─> add_to_txn_journal() to record operation +``` + +**Result**: Root node now has `m_up_buffer = m_sb_buffer`, establishing flush order dependency. 
+ +### 10.2 Root Change During Recovery + +**Trigger**: Recovery detects root change from journal + +**Flow**: +``` +recover_buf(root_buf) // wb_cache.cpp:788-812 + └─> If all down buffers flushed (m_wait_for_down_buffers == 0): + ├─> If !was_node_committed(buf): + │ └─> index_service().repair_index_node(ordinal, buf) + │ + └─> If was_node_committed(buf) && m_up_buffer && m_up_buffer->is_meta_buf(): + └─> index_service().update_root(ordinal, buf) + └─> table->repair_root_node(buf) + ├─> Check if buf->blkid() == m_sb->root_node (is this the old root?) + ├─> If yes, get edge_id from node->next_bnode() + ├─> Clear next_bnode and set edge_value to edge_id + └─> write_node_impl() to persist the fixed root node (not superblock) +``` + +**Key**: When root changes during recovery, `update_root()` is called for committed roots with meta parent. **Important**: This does NOT update the superblock. Instead: +- It calls `repair_root_node()` which checks if the buffer is the old root +- If yes, it reads the new root ID from `node->next_bnode()` +- It then clears next_bnode and sets edge_value to point to the new root +- The old root node is written back with the edge updated +- The superblock (`m_sb->root_node`) is NOT changed - it still points to the old root +- The btree will follow the edge from old root to find the new root during normal access +- The superblock is only updated during normal operation when `on_root_changed()` is called + +### 10.3 Setting Root's Up Buffer + +**Normal Path** (only on root change): +- `on_root_changed()` → `transact_bufs()` → `link_buf()` sets `root_buf->m_up_buffer = m_sb_buffer` +- Done only when root object changes (new root node created), not for normal root node updates +- `link_buf()` also handles the case where up_buffer was created in same CP (uses real_up_buf = up_buf->m_up_buffer) + +**Recovery Path**: +- Sets `root_buf->m_up_buffer` to temporary meta buffer during `process_txn_record()` +- Temporary meta buffer reference has 
dangling superblk, but only used as pointer for dependency tracking +- After recovery, these temporary buffers and dependencies are discarded + +## 11. Known Bugs + +### 11.1 Meta Buffer Prune Bug (FIXED) + +**Location**: `IndexWBCache::prune_up_buffers()` + +**Trigger Scenario**: An IndexTable (ordinal=7) was destroyed (e.g., deleted or failed mid-initialization), but the journal from a prior CP still contains records for it. During recovery: +1. Journal is replayed — a temporary `MetaIndexBuffer` is created for ordinal=7 +2. The child buffer (which was a new/uncommitted node for ordinal=7) is found to be uncommitted +3. The child is removed from the dependency chain, triggering `prune_up_buffers()` +4. The temporary MetaIndexBuffer (ordinal=7's `up_buf`) gets unconditionally added to `pruned_bufs_to_repair` +5. Later, `repair_index_node(ordinal=7, meta_buf)` is called → `get_index_table(7)` returns nullptr → **CRASH** + +The key insight: `prune_up_buffers()` is only called during recovery when discarding uncommitted nodes. A temporary MetaIndexBuffer is never a real B-tree node — it's a placeholder for dependency tracking only and must never be repaired. 
+ +**Buggy Code** (wb_cache.cpp:533-550): +```cpp +void IndexWBCache::prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& pruned_bufs_to_repair) { + auto up_buf = buf->m_up_buffer; + auto grand_up_buf = up_buf->m_up_buffer; + if (!up_buf || !up_buf->m_wait_for_down_buffers.testz()) { return; } + + update_up_buffer_counters(up_buf); + + pruned_bufs_to_repair.push_back(up_buf); // ❌ BUG: no is_meta_buf() check + if (grand_up_buf && !grand_up_buf->is_meta_buf() && grand_up_buf->m_wait_for_down_buffers.testz()) { + pruned_bufs_to_repair.push_back(grand_up_buf); // ✅ correctly skips meta + } +} +``` + +**Fix**: Add the same `is_meta_buf()` guard that already exists for `grand_up_buf`: +```cpp + if (!up_buf->is_meta_buf()) { + pruned_bufs_to_repair.push_back(up_buf); + } +``` + +**Why This Is Safe**: Temporary MetaIndexBuffers are dependency tracking placeholders created from journal records with empty superblocks. They are never real B-tree nodes and carry no state that needs repairing. Real superblock state lives in `IndexTable::m_sb_buffer` and is restored through `update_root()` / `IndexTable::write_sb()`, not through the repair path. + +### 11.2 Dangling Reference in Temporary Meta Buffers + +**Location**: `IndexCPContext::process_txn_record()` + +**Bug**: Temporary meta buffers hold reference to local `superblk` variable that goes out of scope. 
+ +**Current Code** (index_cp.cpp:328-375): +```cpp +void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, IndexBufferPtr >& buf_map) { + auto cpg = cp_mgr().cp_guard(); + + auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, bool is_meta, BlkId const& bid, + IndexBufferPtr const& up_buf) -> IndexBufferPtr { + IndexBufferPtr buf; + auto it = buf_map.find(bid); + if (it == buf_map.end()) { + if (is_meta) { + superblk< index_table_sb > tmp_sb; // ❌ Local variable + buf = std::make_shared< MetaIndexBuffer >(tmp_sb); // ❌ Dangling reference + } else { + buf = std::make_shared< IndexBuffer >(nullptr, bid); + } + // ... + } + // ... + }; + // tmp_sb destroyed here, but buf->m_sb still references it +} +``` + +**Impact**: Currently not triggered in normal recovery flow because: +1. Recovery never calls `write_buf()` on temporary meta buffers (only on freed real nodes at line 622) +2. The only access to `m_sb` is in `write_buf()` at line 130-132 when calling `meta_service().update_sub_sb()` +3. Recovery uses meta buffers purely as dependency tracking pointers + +**Why Not Crashing**: The dangling reference exists but is never dereferenced during normal recovery. + +**Status**: Latent bug - if future code tries to flush/write temporary meta buffers during recovery, it would crash with segfault or data corruption. + +### 11.3 Root Up Buffer Lifecycle + +**Behavior**: Root nodes only get `m_up_buffer` set during explicit root change via `on_root_changed()`, not on regular updates. + +**Scenario**: +1. Recovery completes, root loaded but temporary meta buffer and dependencies are discarded +2. First modification of root after recovery: buffer becomes dirty and flushes without meta dependency +3. Root change (new root node created): `on_root_changed()` called, establishes root→meta dependency + +**Current Behavior**: By design - meta dependency only needed when superblock changes (root_node, depth, link_version update). 
Regular root modifications don't require superblock update, so no meta dependency needed. + +## 13. Configuration + +- `m_cp_flush_fibers`: Number of fibers for parallel buffer flushing (default: 1) +- `m_node_size`: Size of each index node buffer +- Cache capacity controlled by `sisl::SimpleCache` \ No newline at end of file diff --git a/docs/structures/index_btree_nodes.md b/docs/structures/index_btree_nodes.md new file mode 100644 index 000000000..a532f7851 --- /dev/null +++ b/docs/structures/index_btree_nodes.md @@ -0,0 +1,177 @@ +# Index B-Tree Nodes (Concepts and On-Disk Layout) + +This document explains the **node model** used by HomeStore's index B-tree implementation, focusing on: + +- What *root / interior / leaf* mean +- The meaning of *level* +- The meaning of *edge* vs *next_node* +- The persistent on-disk header (`persistent_hdr_t`) +- Key structural invariants (what the sanity check validates) + +This is meant as background for understanding checkpoint (CP) flush and recovery. + +--- + +## 1. Node types and levels + +HomeStore stores an index as a B-tree where every node has a **level**: + +- **Leaf node**: `level == 0` + - Stores *key -> user value* entries. +- **Interior node**: `level > 0` + - Stores *separator keys* and *child pointers* (as `BtreeLinkInfo`). + +The **root** is simply the node currently referenced by the index table superblock (`index_table_sb.root_node`). +Depending on tree size, root may be a leaf (small tree) or an interior node. + +### 1.1 Level invariant +For every parent/child relationship: + +- `child.level == parent.level - 1` + +This is the invariant that failed in the reported incident. + +--- + +## 2. 
Persistent vs transient headers + +Each node buffer consists of: + +- A **persistent header** stored on disk (`persistent_hdr_t`) +- A **node body** (key/value entries) +- A **transient header** stored only in memory (`transient_hdr_t`) used for locking and runtime thresholds + +### 2.1 Persistent header (`persistent_hdr_t`) +Source: `src/include/homestore/btree/detail/btree_node.hpp` + +`persistent_hdr_t` is placed at the start of every node buffer (packed layout). Important fields: + +| Field | Meaning | +|---|---| +| `magic`, `version`, `checksum` | On-disk format verification (`BtreeNode::is_valid_node`) | +| `nentries` | Number of entries in the node body | +| `leaf` | 1-bit flag: leaf vs interior | +| `node_deleted` | 1-bit tombstone; deleted nodes may still exist on disk during recovery | +| `node_id` | Persistent node identifier (`bnodeid_t`); for index this is the block id | +| `next_node` | Persistent sibling pointer (`next_bnode()`) | +| `node_gen` | Generation number; incremented on updates (used by lock/refresh logic) | +| `link_version` | Version of the link from parent -> this node (`BtreeLinkInfo`) | +| `edge_info` | The rightmost child pointer + link_version (see section 3) | +| `modified_cp_id` | CP id of last modification; used by recovery to decide whether a node was committed | +| `level` | Node level within the tree | +| `node_size` | Physical size of the node buffer | +| `node_type` | Node layout variant (simple/prefix/varlen) | + +You can print these fields directly from a core dump by casting the node buffer: + +```gdb +set $hdr = (homestore::persistent_hdr_t*)node->m_phys_node_buf +p *$hdr +``` + +--- + +## 3. Child pointers: entries vs edge + +Interior nodes contain child pointers in two forms: + +1. **Regular children** stored as the value associated with each separator key in the node body. +2. The **edge child** stored separately in `persistent_hdr_t::edge_info`. + +### 3.1 What is the edge child? 
+For an interior node, imagine the children as: + +``` +C0, C1, C2, ... , Cn +``` + +and separator keys: + +``` +K0, K1, ... , K(n-1) +``` + +The node body stores `(Ki -> Ci)` for `i in [0..n-1]`. +The rightmost child `Cn` is stored separately as **edge**. + +In code, this is handled by treating index `total_entries()` as a valid child index only when edge is present. + +Key call sites: + +- `BtreeNode::find(...)` (child selection) +- `Btree::get_child_and_lock_node(...)` (read child at index) + +### 3.2 `has_valid_edge()` +Definition: + +- For leaf nodes: always false +- For interior nodes: true if `edge_id() != empty_bnodeid` + +So an **"edge node"** in logs means: an interior node with `has_valid_edge() == true`. + +--- + +## 4. Sibling linkage: `next_node` / `next_bnode()` + +`persistent_hdr_t::next_node` is a persistent forward pointer: + +- For **leaf nodes**: links leaves in key order to support range scans. +- For **interior nodes**: used to maintain ordering/relationship expectations among children (and validated by sanity checks). + +### 4.1 How split updates sibling pointers +On split, the code does: + +- new right node takes old `next_bnode` +- old (left) node's `next_bnode` updated to point to the new right node + +Source: `src/include/homestore/btree/detail/btree_mutate_impl.ipp` (`split_node`). + +--- + +## 5. Structural invariants validated by sanity checks + +The validation functions live in: + +- `src/include/homestore/btree/detail/btree_common.ipp` + +Key invariants relevant to the incident: + +1. **Level invariant** + - `child.level == parent.level - 1` + +2. **Leaf nodes cannot have edge children** + +3. **A node cannot have both a valid edge and a non-empty next_node** + - (validation expects `has_valid_edge() -> next_node == empty_bnodeid`) + +4. **Child sibling linkage** + - When iterating children of a parent, validation expects: + - `previous_child.next_bnode == current_child.node_id` + +5. 
**Key ordering in node body** + +These invariants are what make the flush/recovery mechanisms safe: parent pointers and ordering assumptions must be consistent. + +--- + +## 6. Notes on terminology in logs + +- **LIVE**: `node_deleted == 0` +- **Deleted**: `node_deleted == 1` +- **LEAF / INTERIOR**: from the persistent `leaf` bit +- **level=N**: from persistent header +- **edge=X.Y**: `edge_info.m_bnodeid = X` and `edge_info.m_link_version = Y` +- **next=...**: `next_node` is not empty + +--- + +## 7. Pointers to code + +- Persistent header definition and accessors: + - `src/include/homestore/btree/detail/btree_node.hpp` + +- Validation logic: + - `src/include/homestore/btree/detail/btree_common.ipp` + +- Split logic updating next pointers: + - `src/include/homestore/btree/detail/btree_mutate_impl.ipp` diff --git a/docs/structures/index_cp_and_recovery.md b/docs/structures/index_cp_and_recovery.md new file mode 100644 index 000000000..c4a77fda6 --- /dev/null +++ b/docs/structures/index_cp_and_recovery.md @@ -0,0 +1,244 @@ +# Index CP Flush and Recovery (Ordering, Journal, and Root Handling) + +This document explains: + +- How IndexWBCache tracks dirty index nodes +- How dependency ordering (DAG) is built +- What the CP transaction journal records and its ordering +- How recovery reconstructs the DAG and decides what to repair +- When index superblocks (SB) are updated + +It is intended to provide enough context to understand crash-consistency issues around root changes. + +--- + +## 1. Key components + +### 1.1 IndexWBCache +The write-back cache that owns `IndexBuffer` objects and orchestrates CP flush. + +Source: +- `src/lib/index/wb_cache.cpp` +- `src/lib/index/wb_cache.hpp` + +### 1.2 IndexBuffer +A buffer representing a btree node. 
+It tracks: + +- Block id (`m_blkid`) +- State (`CLEAN`, `DIRTY`, `FLUSHING`) +- CP ids (`m_created_cp_id`, `m_dirtied_cp_id`) +- Dependency links: + - `m_up_buffer` (child -> parent) + - `m_wait_for_down_buffers` (parent wait-count) + +### 1.3 MetaIndexBuffer +Special buffer that represents the index table superblock (SB) during normal operation. + +Important: during **recovery**, temporary meta buffers can be created from the txn journal, and are used only for dependency tracking. + +--- + +## 2. How dependency ordering works (DAG) + +IndexWBCache enforces a flush order: + +- Children must flush before parents. + +This is encoded as: + +- `down_buf->m_up_buffer = up_buf` +- `up_buf->add_down_buffer(down_buf)` which increments `up_buf->m_wait_for_down_buffers` + +Source: +- `IndexWBCache::link_buf(...)` in `src/lib/index/wb_cache.cpp` + +A buffer can be flushed only when: + +- `state == DIRTY` +- `m_dirtied_cp_id == current_cp` +- `m_wait_for_down_buffers == 0` + +Source: +- `IndexWBCache::get_next_bufs_internal(...)` in `src/lib/index/wb_cache.cpp` + +--- + +## 3. Mutation commit path: `write_node_impl` and `transact_bufs` + +### 3.1 Dirtying nodes +When a btree node is modified, `IndexTable::write_node_impl` is called. + +Key behavior: + +- Marks `IndexBuffer` state to `DIRTY` +- Sets persistent header `modified_cp_id` +- Adds buffer into CP dirty list via `wb_cache().write_buf(...)` + +Source: +- `IndexTable::write_node_impl` in `src/include/homestore/index/index_table.hpp` + +### 3.2 Transaction boundary +Structural operations (split/merge) call `IndexTable::transact_nodes(...)`. + +This does: + +1. `write_node_impl` for: + - newly created nodes + - left child + - parent (if any) +2. `wb_cache().transact_bufs(...)` to link dependencies and append to journal + +Source: +- `IndexTable::transact_nodes` in `src/include/homestore/index/index_table.hpp` + +--- + +## 4. 
Root changes and `on_root_changed` + +When the root changes, `IndexTable::on_root_changed(new_root)` is called. + +It: + +- Updates in-memory SB fields (`index_table_sb`) including `root_node` and `btree_depth` +- Ensures the meta buffer is writable (`refresh_meta_buf`) +- Calls `wb_cache().transact_bufs(meta_buf, root_buf, {}, {})` + - This creates a dependency `meta -> root` and also appends a **meta/root transaction record** to the journal. + +Source: +- `IndexTable::on_root_changed` in `src/include/homestore/index/index_table.hpp` + +--- + +## 5. Transaction journal (what it records and ordering) + +The journal is managed by `IndexCPContext`. + +The record ordering is deterministic: + +1. Parent (in-place) buffer id (optional; can be meta) +2. Child (in-place) buffer id (optional) +3. Newly created child buffers (0..N) +4. Freed child buffers (0..N) + +This ordering is enforced in `txn_record::append(...)`. + +Source: +- `src/lib/index/index_cp.hpp` +- `src/lib/index/index_cp.cpp` (`IndexCPContext::add_to_txn_journal`) + +### 5.1 How meta/root transactions are recorded +In `IndexWBCache::transact_bufs`: + +- When `new_node_bufs` and `freed_node_bufs` are empty, it is treated as a **meta/root transaction**. +- It appends a txn record that includes meta as parent and root as child. + +Source: +- `IndexWBCache::transact_bufs` in `src/lib/index/wb_cache.cpp`. + +--- + +## 6. CP flush ordering and SB update timing + +### 6.1 Flush start +`IndexWBCache::async_cp_flush(cp_ctx)` does: + +1. Writes the txn journal into meta-service first (`update_sub_sb` / `add_sub_sb`). +2. Starts flush fibers that select eligible buffers from the dirty list. + +Source: +- `IndexWBCache::async_cp_flush` in `src/lib/index/wb_cache.cpp` + +### 6.2 Buffer writes +`do_flush_one_buf`: + +- Meta buffers are written to meta service and completed immediately. +- Normal buffers are async-written to vdev. 
+ +Source: +- `IndexWBCache::do_flush_one_buf` in `src/lib/index/wb_cache.cpp` + +### 6.3 When SB is written +After the last dirty buffer completes, `process_write_completion` calls: + +- `index_service().write_sb(ordinal)` for each updated ordinal + +This is an important detail: + +> **Index table SB persistence is performed at the end of CP flush, after all buffers complete.** + +Source: +- `IndexWBCache::process_write_completion` in `src/lib/index/wb_cache.cpp` + +--- + +## 7. Recovery flow overview + +Entry point: + +- `IndexWBCache::recover(sb)` called from `IndexService::start()`. + +High-level steps: + +1. Set `m_in_recovery = true`. +2. Reconstruct DAG from persisted journal: + - `IndexCPContext::recover(...)` returns a map of buffers with `m_up_buffer` links. +3. Two-pass processing: + - Pass 1: handle new/freed nodes (commit/free decisions) + - Pass 2: repair needed parents and recursively repair up-buffers (`recover_buf`) +4. Run sanity check +5. Exit recovery mode + +Source: +- `IndexWBCache::recover` and `recover_buf` in `src/lib/index/wb_cache.cpp` +- `IndexCPContext::recover` in `src/lib/index/index_cp.cpp` + +### 7.1 What "committed" means in recovery +Recovery uses the node's persistent `modified_cp_id`: + +- `was_node_committed(buf)` returns true when: + - node header is valid + - `buf->m_dirtied_cp_id == current_cp_id` + +The header field is read via `BtreeNode::get_modified_cp_id`. + +--- + +## 8. Root handling during recovery + +During `recover_buf(buf)`: + +- If the node is committed and `buf->m_up_buffer` is a meta buffer, recovery calls: + - `index_service().update_root(ordinal, buf)` + - which calls `IndexTable::repair_root_node(buf)` + +`repair_root_node` is intended to fix cases where SB still points to an old root by converting a root-change marker into `edge_info`. + +Source: +- `IndexWBCache::recover_buf` in `src/lib/index/wb_cache.cpp` +- `IndexTable::repair_root_node` in `src/include/homestore/index/index_table.hpp` + +--- + +## 9. 
+ Important debug signal: CLEAN buffer dependency warning + +`IndexWBCache::link_buf` logs when a CLEAN down buffer is added as a dependency: + +- `CLEAN_BUF_DEBUG: Adding CLEAN down_buf ... to up_buf ...` + +This warns about a potential CP hang scenario because CLEAN buffers do not normally participate in the CP dirty list. + +Source: +- `IndexWBCache::link_buf` in `src/lib/index/wb_cache.cpp` + +--- + +## 10. Relevant code pointers + +- Flush: `src/lib/index/wb_cache.cpp` + - `async_cp_flush`, `do_flush_one_buf`, `process_write_completion`, `get_next_bufs_internal` + +- Journal: `src/lib/index/index_cp.hpp`, `src/lib/index/index_cp.cpp` + +- Root SB update: `src/include/homestore/index/index_table.hpp` + - `on_root_changed`, `write_node_impl`, `transact_nodes`, `repair_root_node` diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 0d90cfa9d..29bcc67a2 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -863,6 +863,69 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { } } +// Regression test for the root-split crash bug (SDSTOR-22729): +// During a root split (tree height N→N+1), crash_flush_on_meta fires after +// all node buffers (new_root, old_root, child_node2) are written to disk but +// before meta_buf (SB) is written. On recovery, repair_root_node reads +// old_root.next_bnode (= child_node2.blkid, a level-N interior node) and +// writes it into old_root.edge_info, violating the invariant +// child.level == parent.level - 1. +// +// Two-phase design: +// Phase 1: trigger the 1st root split (leaf → level=1) cleanly so the tree +// reaches a stable level=1 state with an interior old_root. +// Phase 2: set crash_flush_on_meta, insert enough entries to trigger the 2nd +// root split (level=1 → level=2), then crash at meta_buf flush. +// +// Expected outcome after fix: recovery succeeds and the tree is consistent.
+// Without fix: repair_root_node writes child_node2 (level=1) as edge of +// old_root (level=1) → validate_node asserts "child level mismatch". +TYPED_TEST(IndexCrashTest, CrashAtMetaBufOnSecondRootSplit) { + const uint32_t max_keys = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + + // Phase 1: insert max_keys + 1 entries (keys 0..max_keys) to trigger the first root split (leaf → level=1) + // and flush cleanly so the tree is at a stable level=1 before the crash CP. + LOGINFO("Phase 1: Insert {} entries to trigger first root split (leaf→level=1)", max_keys + 1); + for (uint32_t k = 0; k <= max_keys; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + LOGINFO("Phase 1: Flush cleanly — tree is now at level=1, SB is up to date"); + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + // Phase 2: set crash_flush_on_meta before inserts so it fires when the 2nd + // root split calls transact_bufs(meta_buf, new_root_buf). The flip marks + // meta_buf with crash_flag; the actual crash fires only when meta_buf is + // ready to write (after all node bufs complete), leaving the SB stale. + LOGINFO("Phase 2: Set crash_flush_on_meta flip"); + this->set_basic_flip("crash_flush_on_meta"); + + // max_keys * max_keys entries are sufficient to fill the level=1 root (which requires + // ~max_keys child splits, each needing ~max_keys inserts to fill a leaf).
+ const uint32_t phase2_count = max_keys * max_keys; + LOGINFO("Phase 2: Insert {} entries to trigger second root split (level=1→2)", phase2_count); + for (uint32_t k = max_keys + 1; k <= max_keys + phase2_count; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + + LOGINFO("Phase 2: Trigger CP (crash expected at meta_buf flush)"); + test_common::HSTestHelper::trigger_cp(false); + LOGINFO("Phase 2: Waiting for crash and recovery"); + this->wait_for_crash_recovery(true); + + // Disk state after crash (verified against gdb evidence from the production incident): + // - SB: root_node=old_root (level=1), btree_depth=1 [meta_buf not written] + // - old_root on disk: edge_info=EMPTY, next_bnode=child_node2.blkid + // - new_root (level=2) on disk [written before meta_buf] + // + // Without fix: repair_root_node sets old_root.edge_info=child_node2 (level=1) + // → validate_node aborts: "child level: 1, expected: 0" + // With fix: recovery finds new_root correctly; tree is consistent. + LOGINFO("Recovery complete — reapplying and verifying tree integrity"); + this->reapply_after_crash(); + this->get_all(); +} + TYPED_TEST(IndexCrashTest, long_running_put_crash) { long_running_crash_options crash_test_options{ .put_freq = 100,