From 6fe0959821aa5adb012128ee04de669ab338c2cf Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 18 Jun 2026 17:13:47 +0800 Subject: [PATCH] SDSTOR-22293: handle logdev recovery corner case also add relevant knowledge to docs/structures to support AI in future triaging --- conanfile.py | 2 +- ...ogdev-truncate-chunk-meta-inconsistency.md | 142 +++ docs/structures/raft.md | 1031 +++++++++++++++++ docs/structures/raft_repl_dev_log_dev.md | 532 +++++++++ src/lib/device/journal_vdev.cpp | 15 +- src/lib/device/journal_vdev.hpp | 14 +- src/lib/logstore/log_dev.cpp | 56 +- src/tests/test_log_dev.cpp | 134 +++ 8 files changed, 1922 insertions(+), 4 deletions(-) create mode 100644 docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md create mode 100644 docs/structures/raft.md create mode 100644 docs/structures/raft_repl_dev_log_dev.md diff --git a/conanfile.py b/conanfile.py index 87a52abba..4ec340dfb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "7.5.10" + version = "7.5.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md b/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md new file mode 100644 index 000000000..8367dee6f --- /dev/null +++ b/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md @@ -0,0 +1,142 @@ +# RCA: LogDev Crash on Restart — Truncation Chunk/Meta Inconsistency + +**Date:** 2026-05-22 +**Component:** HomeStore / LogStore / JournalVirtualDev +**Severity:** Critical (crash on restart, blocks recovery) + +--- + +## Symptom + +After a SIGKILL, the process crashes during restart at `log_dev.cpp:234`: + +``` +Assertion failure: Expected '473577' to be == to '460780' +log indx is not the expected one +``` + +`do_load` reads the first log group from the persisted start dev_offset +(`0x54BC8000`) and finds `start_idx=473577`, but the logdev superblock says +`log_idx=460780`. + +--- + +## Root Cause Chain + +### Condition 1: `m_vdev_jd->truncate()` persists chunk metadata synchronously + +`JournalVirtualDev::Descriptor::truncate()` calls `update_chunk_private()` for +every chunk it modifies. The call chain: + +``` +update_chunk_private() + → chunk->set_user_private() + → write_chunk_info() + → physical_dev->write_super_block() + → m_drive_iface->sync_write() ← synchronous disk write +``` + +Specifically, truncation: +1. Marks the new head chunk `is_head=true` — **sync_write** +2. Calls `release_chunk_to_pool()` on each removed chunk, which clears its + private data — **sync_write** per chunk + +All writes complete before `truncate()` returns. +**Evidence:** `physical_dev.cpp:122` — `sync_write` call, no async path. + +### Condition 2: `m_logdev_meta.persist()` is called after `m_vdev_jd->truncate()` returns + +`LogDev::truncate()` (`log_dev.cpp:665–688`): + +```cpp +m_vdev_jd->truncate(min_safe_ld_key.dev_offset); // chunk state durable + +// ← SIGKILL WINDOW OPENS HERE + +m_logdev_meta.set_start_dev_offset(..., false); // in-memory only +m_logdev_meta.unreserve_store(..., false); // in-memory only +m_logdev_meta.remove_rollback_record_upto(..., false); // in-memory only + +m_logdev_meta.persist(); // ← logdev superblock written; never reached +``` + +When `stopping=false` (normal operation), all intermediate calls use +`persist_now=false`. Only the final `persist()` writes to disk. +**Evidence:** code structure, `log_dev.cpp:665–688`. + +### Condition 3: On restart, stale logdev meta is interpreted against a new chunk list + +`JournalVirtualDev::init()` rebuilds the chunk list from on-disk chunk private +data (not from logdev meta). After the truncation in Condition 1, the new head +chunk is persisted. On restart: + +- **Chunk list (from disk):** `[Chunk B (is_head=true), ...]` + (Chunk A was released and its private data cleared/persisted to disk) +- **Logdev meta (from disk):** `(dev_offset=0x54BC8000, log_idx=460780)` + (old value, persist never ran) + +`LogDev::start()` calls: +```cpp +m_vdev_jd->update_data_start_offset(0x54BC8000); // set from stale meta +m_log_idx = 460780; +do_load(0x54BC8000); +``` + +### Condition 4: `offset_to_chunk(0x54BC8000)` maps to Chunk B in the new list + +`journal_vdev.cpp:727`: +```cpp +chunk_aligned_offset = round_down(m_data_start_offset, chunk_size); +off_l = 0x54BC8000 - chunk_aligned_offset; +// Iterates [Chunk B, ...] → maps to Chunk B at offset off_l +``` + +The old offset `0x54BC8000` is now interpreted against the new chunk list. It +maps to a physical position inside Chunk B, where log data starting at +`start_idx=473577` resides. + +### Condition 5: `do_load` has no handler for `header->start_idx() > m_log_idx` + +`log_dev.cpp:225–234`: +```cpp +if (loaded_from == -1 && header->start_idx() < m_log_idx) { + // handles: journal fully truncated → break +} +// No handling for start_idx > m_log_idx ← the actual case (473577 > 460780) +HS_REL_ASSERT_EQ(header->start_idx(), m_log_idx.load(), ...); // CRASH +``` + +The assert fires because the gap case (truncation advanced further than what +is recorded in meta) is unhandled. + +--- + +## Trigger Conditions + +The following sequence must all occur: + +1. LogDev has at least two chunks in its journal chunk list +2. A truncation is triggered (raft compact or resource pressure) where + `min_safe_ld_key.dev_offset` falls in chunk N+1 or later (i.e., the + truncation point crosses a chunk boundary, causing chunk N to be released) +3. SIGKILL arrives after `m_vdev_jd->truncate()` returns but before + `m_logdev_meta.persist()` executes + +The window is narrow (microseconds to low milliseconds) but non-zero, and +SIGKILL is unblockable. + +**Observed instance (2026-05-10):** +- `log_dev=1`, pre-kill flush at `log_idx=490277` +- Truncation with `min_safe_ld_key.idx=475195` +- Persisted meta: `(dev_offset=0x54BC8000, log_idx=460780)` +- First log in new chunk list: `start_idx=473577` + +--- + +## Skipped Conditions + +None. All four conditions confirmed via static code analysis and log evidence. +The chunk list change (Condition 3) is proven by the crash itself via proof by +contradiction: if the chunk list had not changed, `offset_to_chunk(0x54BC8000)` +would map to the original Chunk A and recover log 460780 correctly; the +observed mismatch is impossible without a chunk list change. diff --git a/docs/structures/raft.md b/docs/structures/raft.md new file mode 100644 index 000000000..9f5297a34 --- /dev/null +++ b/docs/structures/raft.md @@ -0,0 +1,1031 @@ +# RAFT Replication Log Management + +This document provides a detailed explanation of RAFT log management in HomeStore, covering the architecture, append flow, compact/truncate flow, and rollback flow. It serves as a reference for code understanding and issue analysis. + +## Architecture Overview + +### Component Stack and Responsibilities + +``` +┌─────────────────────────────────────────────────────────────┐ +│ NuRaft (nuraft library) │ +│ - Manages RAFT consensus protocol │ +│ - Calls log_store interface to persist/retrieve logs │ +└─────────────────────────────────────────────────────────────┘ + │ + │ nuraft::log_store interface + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ ReplLogStore │ +│ - Implements nuraft::log_store interface │ +│ - Manages repl_req lifecycle and LSN mapping │ +│ - Coordinates data channel and raft channel │ +│ - Inherits from HomeRaftLogStore │ +└─────────────────────────────────────────────────────────────┘ + │ + │ Delegates to parent class + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ HomeRaftLogStore │ +│ - Converts between RAFT LSN and LogStore LSN │ +│ - Serializes/deserializes nuraft log entries │ +│ - Caches log entries for fast access │ +│ - Contains m_log_store (HomeLogStore) │ +└─────────────────────────────────────────────────────────────┘ + │ + │ Uses HomeLogStore for storage + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ HomeLogStore │ +│ - Manages log sequence numbers (LSNs) │ +│ - Tracks logdev_key for each LSN │ +│ - Maintains truncation boundaries │ +│ - One logstore per RAFT log │ +└─────────────────────────────────────────────────────────────┘ + │ + │ Writes to LogDev + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ LogDev │ +│ - Physical log device managing actual disk writes │ +│ - Batches multiple log records into LogGroups │ +│ - Assigns logdev_key {log_idx, dev_offset} to each record │ +│ - Currently: one LogDev per HomeLogStore (1:1 mapping) │ +│ - Performs physical truncation on journal vdev │ +└─────────────────────────────────────────────────────────────┘ +``` + +### LSN and Key Concepts + +#### LSN (Log Sequence Number) Mapping +- **RAFT LSN** (repl_lsn): Log index in RAFT, starts from 1 +- **LogStore LSN** (store_lsn): Sequence number in HomeLogStore = `repl_lsn - 1` +- **Conversion formula**: `store_lsn = repl_lsn - 1` +- **Example**: RAFT LSN 100 → LogStore LSN 99 + +#### LogDev Key Structure +```cpp +struct logdev_key { + logid_t idx; // Log index in LogDev + off_t dev_offset; // Physical offset on journal device +} +``` +- Each log written to LogDev gets a unique logdev_key +- The `idx` is assigned sequentially by LogDev, independent of store_lsn +- The `dev_offset` is the actual byte offset on the physical journal device + +#### Replication Request (repl_req) +Represents a single write operation in the replication pipeline: +- **Contents**: user header, key, data buffer, LSN +- **State tracking**: DATA_WRITTEN, LOG_FLUSHED, etc. +- **Creation timing**: + - **Leader**: Created in `async_alloc_write` before proposing to RAFT + - **Follower**: Created when first receiving the entry, via two possible paths: + - **RAFT channel**: In `ReplLogStore::append` when receiving RAFT log entry + - **Data channel**: In `on_push_data_received` when receiving actual data + - Whichever arrives first creates the repl_req; the other finds it already exists + +--- + +## Append Flow + +The append flow describes how a write operation is replicated and persisted. It involves two parallel channels: +- **Data Channel**: Transfers actual user data from leader to followers +- **RAFT Channel**: Transfers RAFT log entries for consensus + +### High-Level Flow + +``` +Leader: + 1. async_alloc_write (allocate blocks, write data) + 2. raft_server()->append_entries_ext() (propose to RAFT) + 3. RAFT replicates log entries to all members + +All Members (Leader & Followers): + 4. ReplLogStore::append (receive log entry) + 5. HomeRaftLogStore::append (add to logstore in-memory) + 6. LogDev::append_async (add to pending flush queue) + 7. ReplLogStore::end_of_append_batch (flush to disk) + 8. LogDev::flush (physical write) +``` + +--- + +### Step 1: Leader Proposes to RAFT + +**Entry Point**: `RaftReplDev::async_alloc_write` + +**What happens**: +1. Leader receives write request with header, key, and data +2. Allocates blocks from data service +3. Writes actual data to allocated blocks +4. Creates `repl_req` to track this write operation +5. Serializes write info into RAFT log entry +6. Calls `raft_server()->append_entries_ext()` to propose entry to RAFT + +**Key Point**: Leader writes data BEFORE proposing to RAFT. When log is flushed later, data is already on disk. + +**Output**: RAFT begins replicating this log entry to followers + +--- + +### Step 2: RAFT Replicates to All Members + +**What happens**: +- NuRaft library handles replication via network +- All members (including leader) receive log entries via `append()` callback +- RAFT batches multiple log entries together for efficiency + +**Key Point**: From this point on, leader and followers follow the same append path through logstore. + +--- + +### Step 3: Receive Log Entry in ReplLogStore + +**Entry Point**: `ReplLogStore::append` + +**What happens**: +1. Check if entry is app_log (skip config entries) +2. **Create or retrieve repl_req**: + - **For Leader**: repl_req already exists from Step 1 + - **For Follower**: Create repl_req now (first time seeing this entry) + - Can be triggered by **RAFT channel** (this function) OR **data channel** (`on_push_data_received`) + - Whichever arrives first creates the repl_req +3. Link the LSN to repl_req for future lookup +4. Delegate to parent class `HomeRaftLogStore::append` + +**Output**: Returns RAFT LSN (repl_lsn) + +--- + +### Step 4: Add to LogStore (In-Memory) + +**Entry Point**: `HomeRaftLogStore::append` + +**What happens**: +1. Serialize nuraft log entry to buffer format: + ``` + [8 bytes: term][1 byte: log_val_type][N bytes: user data] + ``` +2. Convert RAFT LSN to LogStore LSN: `store_lsn = repl_lsn - 1` +3. Call `HomeLogStore::append_async` with serialized buffer +4. Cache the log entry at position `lsn % cache_size` for fast access +5. Return RAFT LSN + +**Key Point**: Log is still in MEMORY, not on disk yet. + +**Output**: Log entry added to logstore's in-memory pending queue + +--- + +### Step 5: Enqueue to LogDev (In-Memory) + +**Entry Point**: `LogDev::append_async` + +**What happens**: +1. Create `log_record` structure: + ```cpp + log_record { + store_id: which logstore this belongs to + seq_num: logstore sequence number (from HomeLogStore) + data: serialized log entry + context: pointer to logstore_req + } + ``` +2. Assign a unique log_idx (LogDev's internal sequence number) +3. Add record to `m_log_records` stream tracker (in-memory queue) +4. Increment `m_pending_flush_size` by data size +5. Return the seq_num to caller + +**Two Index Values**: +- **log_idx returned by write_at inside this function**: LogDev's internal index, **NOT USED** +- **seq_num in log_record**: This is the logstore's LSN (from `m_next_lsn`), which is the actual LSN used + +**Key Point**: Multiple logstores can share one LogDev. Each record is tagged with store_id. + +**Output**: log_idx assigned, waiting for batch flush + +--- + +### Step 6: End of Append Batch - Coordination + +**Entry Point**: `ReplLogStore::end_of_append_batch` + +This is called by NuRaft after appending a batch of entries (RAFT channel callback). + +**What happens**: + +1. **Separate requests into two groups**: + - **Proposer requests** (leader's own writes): Data already written in Step 1 + - **Non-proposer requests** (follower's received writes): Data may not be written yet + +2. **For non-proposer requests (followers)**: + - Call `notify_after_data_written()` to check if data is written + - Future completes when data is confirmed written to data service + - **Critical**: MUST ensure data write completion before log flush + +3. **Flush logs to disk**: + - Call `HomeRaftLogStore::end_of_append_batch` + - This triggers actual disk flush + +4. **Mark completion**: + - Mark all repl_reqs as `LOG_FLUSHED` + - Set `is_volatile = false` on all repl_reqs + +**Why wait for data?** +If logs are flushed before data is written, upon recovery we'd replay logs but data would be missing, causing data inconsistency. + +**Output**: Logs are now durable on disk, data is written + +--- + +### Step 7: Flush LogStore + +**Entry Point**: `HomeRaftLogStore::end_of_append_batch` + +**What happens**: +1. Calculate end LSN: `end_lsn = store_lsn(start + count - 1)` +2. Call `m_log_store->flush(end_lsn)` +3. Update `m_last_durable_lsn = end_lsn` + +**Entry Point**: `HomeLogStore::flush` + +**What happens**: +- Delegates to LogDev to perform actual flush +- No-op if already flushed + +--- + +### Step 8: Physical Flush to Device + +**Entry Point**: `LogDev::flush` + +This is where logs physically hit the disk. + +**What happens**: + +1. **Gather pending records**: + - Get all log records from `m_log_records` between `m_last_flush_idx` and current `m_log_idx` + - These are the in-memory log records waiting to be flushed + +2. **Create LogGroup**: + - LogGroup is the unit of physical write + - Contains header, multiple log records, inline data, OOB data, and footer + - Format (based on code comments in log_dev.hpp): + ``` + [LogGroup Header: #records, oob_area_offset, inline_area_offset, ...] + [Record 1: size, offset, is_inlined, store_seq_num, store_id] + [Record 2: size, offset, is_inlined, store_seq_num, store_id] + ... + [Inline Data Area: data for small records] + [OOB (Out-of-Band) Data Area: data for large records] + [Footer: magic, start_log_idx] + ``` + + **Inline vs OOB**: + - Small records: Data stored in inline area + - Large records: Data stored in OOB area + - Record's `is_inlined` flag indicates which area to read from + +3. **Allocate space on journal vdev**: + - Call `m_vdev_jd->alloc_next_append_blk(total_size)` + - Returns physical offset on journal device + - Store this in `lg->m_log_dev_offset` + +4. **Assign logdev_key to each record**: + **All records in same LogGroup share the same dev_offset** + ```cpp + logdev_key { + idx: unique log_idx for this record (from LogDev sequence) + dev_offset: lg->m_log_dev_offset (SAME for entire LogGroup) + } + ``` + +5. **Write to physical device**: + - Prepare iovecs (scatter-gather list) from LogGroup + - Call `m_vdev_jd->sync_pwritev(iovecs, offset)` + - This is a synchronous write ensuring data is on disk + +6. **Update tracking**: + - Update `m_last_flush_idx` to last flushed log_idx + - Update `m_last_flush_ld_key` with the logdev_key of last flushed log + - Decrement `m_pending_flush_size` + +7. **Notify completion**: + - For each log record in LogGroup: + - Find corresponding HomeLogStore via store_id + - Call logstore's `on_write_completion(req, logdev_key, flush_ld_key)` + - LogStore updates its in-memory mapping: `seq_num → {logdev_key, trunc_key}` + - Call user's completion callback + +**Key Points**: +- Multiple log records are batched into one LogGroup for efficiency +- Each log gets a unique logdev_key for later retrieval +- Writes are synchronous (blocking) to ensure durability + +**Output**: Logs are now physically durable on journal device + +--- + +### Follower Data Channel Flow + +Followers receive data via data channel in parallel with raft channel. + +**Entry Point**: `RaftReplDev::on_push_data_received` + +**What happens**: +1. Leader calls `push_data_to_all_followers` after Step 1 +2. Follower receives push data RPC +3. Create or retrieve repl_req (may already exist from RAFT channel) +4. Save pushed data buffer in repl_req +5. Write data to data service: `data_service().async_write()` +6. Mark repl_req as `DATA_WRITTEN` and fulfill promise +7. If RAFT channel is waiting in Step 6, it now proceeds + +**Race Handling**: +- **Data arrives first**: Data written, RAFT channel finds data ready +- **RAFT arrives first**: RAFT channel waits for data via future +- Either order works correctly + +--- + +## Compact/Truncate Flow + +Compact/truncate reclaims space from old logs that have been snapshotted and are no longer needed for recovery or replication. + +### Trigger and Context + +**When it happens**: +- RAFT creates snapshot periodically +- After snapshot completes, RAFT calls `compact(upto_lsn)` to remove old logs +- Manually triggered via `trigger_snapshot_creation` + +**Key Constraint - truncation_upper_limit**: +- Calculated by leader based on all followers' replication progress +- Definition: `max LSN that can be safely truncated without affecting any follower` +- Computed in `RaftReplDev::propose_truncate_boundary`: + ``` + minimum_repl_idx = min(follower1_idx, follower2_idx, ...) + truncation_upper_limit = max(commit_lsn - reserve_threshold, minimum_repl_idx) + ``` +- Ensures no follower needs logs we're about to delete + +--- + +### Step 1: Compute Effective Compact LSN + +**Entry Point**: `ReplLogStore::compact` + +**What happens**: +1. RAFT wants to compact up to `compact_upto_lsn` (from snapshot LSN) +2. Get current `truncation_upper_limit` from ReplDev +3. Compute safe compact point: + ```cpp + effective_compact_lsn = min(compact_upto_lsn, truncation_upper_limit) + ``` +4. Call `m_rd.on_compact(effective_compact_lsn)`: + - Updates ReplDev's `m_compact_lsn` to effective_compact_lsn + - This value will be persisted later during checkpoint flush +5. Proceed to compact LogStore: `HomeRaftLogStore::compact(effective_compact_lsn)` + +**Why limit by truncation_upper_limit?** +- RAFT snapshot may be at LSN 10000 +- But a slow follower may only be at LSN 8000 +- We can't truncate logs beyond 8000 or follower can't catch up +- So effective_compact_lsn = min(10000, 8000) = 8000 + +**Output**: Safe LSN determined for truncation + +--- + +### Step 2: Compact in HomeRaftLogStore + +**Entry Point**: `HomeRaftLogStore::compact` + +**What happens**: +1. Validate compact_lsn: + - Check if `compact_lsn > current_max_lsn` + - If true, log a warning (may indicate holes in log) + - Still proceed with truncation +2. Convert RAFT LSN to LogStore LSN: + ```cpp + store_lsn = compact_lsn - 1 + ``` +3. Call `m_log_store->truncate(store_lsn, false)`: + - `false` means perform physical truncation (not in-memory only) + +**Output**: Request to truncate logstore up to store_lsn + +--- + +### Step 3: Truncate LogStore (In-Memory) + +**Entry Point**: `HomeLogStore::truncate` + +**What happens**: + +1. **Flush any pending logs first**: + - Call `flush()` to ensure all in-memory logs are on disk + - This prevents truncating logs that aren't yet durable + +2. **Determine truncation key**: + - If `upto_lsn <= m_tail_lsn` (normal case): + - Get record at `upto_lsn`: `rec = m_records.at(upto_lsn)` + - Extract `m_trunc_ld_key = rec.m_trunc_key` + - This logdev_key marks the truncation boundary + - If `upto_lsn > m_tail_lsn` (baseline resync case): + - Update `m_tail_lsn` and `m_next_lsn` to upto_lsn + - Insert empty record at upto_lsn + - This handles cases where snapshot LSN exceeds current logs + +3. **Truncate in-memory records**: + - Call `m_records.truncate(upto_lsn)` + - Removes all records with `seq_num <= upto_lsn` from memory + - These LSNs are now invalid for future reads + +4. **Update start LSN**: + - Set `m_start_lsn = upto_lsn + 1` + - Future log reads/writes must be >= upto_lsn + 1 + +5. **Trigger physical truncation**: + - If `!in_memory_truncate_only`: call `m_logdev->truncate()` + - This propagates truncation to LogDev layer + +**Key State**: +- `m_start_lsn`: First valid LSN in logstore (upto_lsn + 1) +- `m_tail_lsn`: Last written LSN +- `m_trunc_ld_key`: LogDev key for truncation boundary + +**Output**: In-memory state updated, physical truncation triggered + +--- + +### Step 4: Truncate LogDev (Physical) + +**Entry Point**: `LogDev::truncate` + +**What happens**: + +1. **Lock order** (important for avoiding deadlock): + - Acquire flush_guard (prevents concurrent flush) + - Acquire store_map read lock (protects logstore map) + - Acquire meta_mutex (protects metadata updates) + +2. **Find minimum safe truncation point across all logstores**: + ```cpp + logdev_key min_safe_ld_key = out_of_bound; + + for each logstore on this logdev: + (trunc_lsn, trunc_ld_key, tail_lsn) = logstore->truncate_info() + + // Update logstore superblock with new start LSN + logstore_sb.m_first_seq_num = trunc_lsn + 1 + persist(logstore_sb) + + // Track minimum across all stores + if trunc_ld_key.idx < min_safe_ld_key.idx: + min_safe_ld_key = trunc_ld_key + ``` + + **Why find minimum?** + - One LogDev can serve multiple logstores (though currently 1:1) + - Can only truncate up to the slowest logstore's boundary + - Ensures no logstore loses its needed data + +3. **Handle edge cases**: + - If all logstores are empty: `min_safe_ld_key = m_last_flush_ld_key` + - If no truncation needed (min_safe_ld_key <= m_last_truncate_idx): + - Still persist logstore superblocks + - Return 0 (no space reclaimed) + +4. **Physical truncation on journal vdev**: + - Call `m_vdev_jd->truncate(min_safe_ld_key.dev_offset)` + - Reclaims space on journal device up to this offset + - All data before this offset is now freed + +5. **Update LogDev metadata**: + - Set new `start_dev_offset = min_safe_ld_key.dev_offset` + - Set new `start_log_idx = min_safe_ld_key.idx` + - Persist metadata via `m_logdev_meta.persist()` + - Update `m_last_truncate_idx = min_safe_ld_key.idx` + +6. **Return reclaimed records count**: + ```cpp + num_records = min_safe_ld_key.idx - m_last_truncate_idx + ``` + +**LogDev Metadata Persistence**: +The truncation info is stored in logdev superblock: +```cpp +struct logdev_superblk { + start_dev_offset: physical offset to start reading from + key_idx: log_idx to start reading from + ... +} +``` + +During recovery (`LogDev::do_load`), logs are read from this offset forward. + +**Output**: Physical space reclaimed on journal device + +--- + +### Step 5: Persist Compact LSN in Checkpoint + +**Entry Point**: `RaftReplDev::cp_flush` + +This happens during periodic checkpoint flush. + +**What happens**: +1. Read current `m_compact_lsn` from ReplDev (updated in Step 1) +2. Write to repl dev superblock: + ```cpp + m_rd_sb->compact_lsn = m_compact_lsn.load() + m_rd_sb.write() + ``` +3. Superblock is persisted to meta service + +**Why persist?** +- On recovery, need to know what LSN was compacted to +- Determines safe starting point for log replay +- Prevents re-processing already-snapshotted logs + +**Checkpoint Flush Trigger**: +- Periodic: triggered by CP manager +- Manual: via `trigger_snapshot_creation` +- Before snapshot: to ensure clean state + +**Output**: Compact LSN durable on disk + +--- + +### Complete Flow Summary + +``` +Step 1: ReplLogStore::compact + - Limit by truncation_upper_limit + - effective_compact_lsn = min(snapshot_lsn, truncation_upper_limit) + - Update m_compact_lsn in ReplDev + +Step 2: HomeRaftLogStore::compact + - Convert repl_lsn to store_lsn + +Step 3: HomeLogStore::truncate + - Flush pending logs + - Get m_trunc_ld_key from record + - Truncate in-memory records + - Update m_start_lsn + +Step 4: LogDev::truncate + - Find min truncation point across all logstores + - Persist logstore superblocks + - Truncate journal vdev physically + - Persist logdev metadata + +Step 5: RaftReplDev::cp_flush (during checkpoint) + - Persist m_compact_lsn to repl dev superblock +``` + +--- + +## Rollback Flow + +Rollback handles the case when a follower (or old leader) has uncommitted logs that conflict with the new leader's logs. This happens during leader elections. + +### When Rollback Occurs + +**Scenario**: +1. Old leader appended logs at LSN 100-105 but crashed before committing +2. New leader elected, has committed logs only up to LSN 99 +3. New leader sends its own logs starting at LSN 100 (different from old leader's) +4. Follower (old leader) must rollback LSN 100-105 and accept new leader's logs + +**NuRaft's Two-Phase Process**: +1. **Phase 1 - Rollback**: Rollback invalid logs in **reverse order** (105 → 104 → ... → 100) +2. **Phase 2 - Overwrite**: Write new logs in **forward order** (100 → 101 → ...) + +--- + +### Phase 1: Rollback Invalid Logs (Reverse Order) + +**Entry Point**: NuRaft calls `state_machine->rollback_ext()` for each log in reverse + +**NuRaft's Logic**: +```cpp +// Rollback from last log backwards to conflict point +for (idx = my_last_log_idx; idx >= conflict_idx; idx--) { + log_entry = log_store_->entry_at(idx) + if (log_entry.type == app_log) { + state_machine_->rollback_ext(idx, log_entry.buffer) + } else if (log_entry.type == conf) { + state_machine_->rollback_config(idx, config) + } +} +``` + +**What happens**: +1. **For each log from 105 down to 100**: + - Read the log entry from logstore + - If app_log: Call `RaftStateMachine::rollback_ext(idx, buffer)` + - If config_log: Call `RaftStateMachine::rollback_config(idx, config)` + +2. **In RaftStateMachine::rollback_ext**: + - Notify application layer via listener: `m_listener->on_pre_commit_rollback(lsn, ...)` + - Application can undo any in-memory state changes + - Does NOT remove logs from logstore yet + +**Key Point**: This is a notification phase, allowing application to clean up state before logs are physically removed. + +--- + +### Phase 2: Overwrite with New Logs (Forward Order) + +After rollback phase completes, NuRaft proceeds to overwrite. + +**Entry Point**: NuRaft calls `log_store_->write_at(index, entry)` for each new log + +**NuRaft's Logic**: +```cpp +// Overwrite logs in forward order +while (log_idx < next_slot() && has_more_entries) { + entry = new_entries[count] + log_store_->write_at(log_idx, entry) + + if (entry.type == app_log) { + state_machine_->pre_commit_ext(log_idx, entry.buffer) + } + + log_idx++ + count++ +} +``` + +**What happens**: +1. **For each new log from 100 onwards**: + - Call `ReplLogStore::write_at(index, entry)` + - Call `RaftStateMachine::pre_commit_ext()` to notify application + +--- + +### Step 1: Write At Specific Index + +**Entry Point**: `ReplLogStore::write_at` + +**What happens**: +1. Check if entry is app_log (skip config entries) +2. Create or retrieve repl_req from journal entry +3. Link LSN to repl_req +4. Delegate to `HomeRaftLogStore::write_at(index, entry)` + +**Key Point**: This is called for EACH log in the overwrite range (100, 101, 102, ...) + +--- + +### Step 2: Rollback and Append Single Log + +**Entry Point**: `HomeRaftLogStore::write_at` (called for EACH index) + +**What happens**: + +1. **Serialize new log entry**: + - Format: `[term][type][data]` + +2. **Rollback logstore to before this index**: + - Call `m_log_store->rollback(to_store_lsn(index) - 1)` + - Example: For index=100, rollback to store_lsn=99 + - This removes log at index 100 and all after it + +3. **Reset durable LSN**: + - Set `m_last_durable_lsn = -1` + - Will be recalculated on next flush + +4. **Append new log at this index**: + - Call `m_log_store->append_async(new_data)` + - Since we just rolled back to 99, next append is at 100 + +5. **Validate LSN matches**: + - Assert `appended_lsn == index` + +6. **Clear cache entries after this index**: + - Remove cached entries with LSN > index + - Prevents serving stale data + +7. **Flush immediately**: + - Call `end_of_append_batch(index, 1)` + - Ensures new log is durable + +**Why rollback before each write_at?** +- First write_at(100): Removes 100-105, writes new 100 +- Second write_at(101): Removes 101 onwards (already gone), writes new 101 +- Ensures each new log is written cleanly + +--- + +### Step 3: Rollback In-Memory State + +**Entry Point**: `HomeLogStore::rollback` + +**What happens**: + +1. **Rollback next LSN**: + - Set `m_next_lsn = upto_lsn + 1` + - Example: rollback(99) sets m_next_lsn = 100 + +2. **Truncate in-memory records**: + - Call `m_records.truncate(upto_lsn)` + - Removes all records with `seq_num > upto_lsn` + +3. **Update tail LSN** (if needed): + - Set `m_tail_lsn = upto_lsn` + +**Key Point - No Physical Truncation**: +- Does NOT call `m_logdev->truncate()` +- Old logs remain on disk but are logically invalid +- Will be overwritten when new logs are appended +- Physical space reclaimed during next compact + +**Why no physical truncation?** +1. **Efficiency**: Rollback is rare, no need for expensive truncation +2. **Simplicity**: Just overwrite in-place +3. **Safety**: Old data remains until overwritten + +--- + +### Complete Rollback Flow Example + +``` +Initial State (Old Leader): + RAFT LSN: 97 98 99 100 101 102 103 104 105 + Committed: ✓ ✓ ✓ ✗ ✗ ✗ ✗ ✗ ✗ + +Leader Change → New Leader has different logs at 100-102 + +Phase 1 - NuRaft Rollback (Reverse Order): + rollback_ext(105) → Notify app to undo state for 105 + rollback_ext(104) → Notify app to undo state for 104 + rollback_ext(103) → Notify app to undo state for 103 + rollback_ext(102) → Notify app to undo state for 102 + rollback_ext(101) → Notify app to undo state for 101 + rollback_ext(100) → Notify app to undo state for 100 + +Phase 2 - NuRaft Overwrite (Forward Order): + write_at(100, new_entry_100): + → rollback(99) removes 100-105 from logstore + → append new log 100 + → flush + + write_at(101, new_entry_101): + → rollback(100) removes 101 onwards (already gone) + → append new log 101 + → flush + + write_at(102, new_entry_102): + → rollback(101) removes 102 onwards (already gone) + → append new log 102 + → flush + +Final State: + RAFT LSN: 97 98 99 100 101 102 + Committed: ✓ ✓ ✓ ✗ ✗ ✗ (will commit later) + + Logs 100-102 now contain NEW data from new leader + Old logs 100-105 are orphaned on disk +``` + +--- + +### Rollback vs Compact + +| Aspect | Rollback | Compact | +|--------|----------|---------| +| **When** | Leader change, uncommitted logs | After snapshot, committed logs | +| **Scope** | From rollback point to end | From start to compact point | +| **Physical** | No disk truncation | Physical disk truncation | +| **Data** | Overwritten in-place | Reclaimed via truncate | +| **Frequency** | Rare (only on failures) | Regular (after snapshots) | + +--- + +## Recovery Flow + +Recovery rebuilds the in-memory state of RAFT logs from persistent storage after restart. + +### Overview + +``` +Restart → LogDev::do_load() → Read logs from disk → +on_log_found callbacks → Rebuild logstore mappings → +RAFT replay → Rebuild state machine +``` + +--- + +### Step 1: Load LogDev Metadata + +**Entry Point**: `LogDev::start` (when format=false) + +**What happens**: +1. Read logdev superblock from meta service: + ```cpp + struct logdev_superblk { + start_dev_offset: where to start reading logs + key_idx: log_idx to start from + num_stores: how many logstores on this logdev + ... + } + ``` + +2. Read logstore superblocks for each store: + ```cpp + struct logstore_superblk { + m_first_seq_num: first valid LSN for this logstore + } + ``` + +3. Initialize LogDev state: + - Set `m_log_idx = key_idx` (resume log index sequence) + - Prepare to read from `start_dev_offset` + +**Output**: LogDev knows where to start reading + +--- + +### Step 2: Scan and Parse Logs + +**Entry Point**: `LogDev::do_load` + +**What happens**: + +1. **Create log stream reader**: + - Start reading from `start_dev_offset` + - Uses `log_stream_reader` to parse LogGroups + +2. **Read LogGroup by LogGroup**: + ```cpp + while not end of device: + LogGroup = read_next_log_group() + validate_header(LogGroup.header) + for each record in LogGroup: + dispatch to correct logstore + ``` + +3. **Validate LogGroup**: + - Check magic number: `LOG_GROUP_HDR_MAGIC` + - Verify CRC matches + - Check prev_grp_crc links to last group + +4. **Extract log records**: + - Parse each `serialized_log_record` from LogGroup + - Structure: + ```cpp + serialized_log_record { + size: size of log data + offset: offset within LogGroup + is_inlined: whether data is inline or OOB + store_seq_num: logstore LSN + store_id: which logstore this belongs to + } + ``` + +5. **Call logstore callback**: + - For each record: `on_logfound(store_id, seq_num, logdev_key, flush_ld_key, data, ...)` + - Passes to correct HomeLogStore based on store_id + +**Key Points**: +- Reads sequentially from start_dev_offset to end +- Stops at first invalid LogGroup (end of valid logs) +- Handles inline vs OOB data within LogGroup + +**Output**: All log records dispatched to logstores + +--- + +### Step 3: Rebuild HomeLogStore State + +**Entry Point**: `HomeLogStore::on_log_found` + +**What happens**: + +1. **Filter by start LSN**: + - If `seq_num < m_start_lsn`: skip this record + - `m_start_lsn` comes from logstore superblock (after truncation) + +2. **Update tail LSN**: + - If `seq_num > m_tail_lsn`: update `m_tail_lsn = seq_num` + - Track highest LSN seen + +3. **Determine truncation key**: + ```cpp + if seq_num > m_tail_lsn: + trunc_key = flush_ld_key // This is new tail + else: + trunc_key = m_records.at(m_tail_lsn).m_trunc_key // Inherit from current tail + ``` + +4. **Create logstore record**: + - Build in-memory record: + ```cpp + logstore_record { + m_dev_key: logdev_key (where to read this log) + m_trunc_key: logdev_key (truncation boundary) + } + ``` + - Insert into `m_records` at seq_num + +5. **Update next LSN**: + - Set `m_next_lsn = max(m_next_lsn, seq_num + 1)` + - Ensures next append doesn't conflict + +6. **Call user callback** (if registered): + - User-provided `log_found_cb(seq_num, data, context)` + - Allows upper layer to examine logs during recovery + +**Output**: HomeLogStore has complete `seq_num → logdev_key` mapping + +--- + + +### Recovery Example + +**Persistent State**: +``` +LogDev Superblock: + start_dev_offset: 1024 + key_idx: 50 + +LogStore Superblock: + m_first_seq_num: 100 (after last truncation) + +Journal Device: + [Offset 1024]: LogGroup containing records 100-105 + [Offset 2048]: LogGroup containing records 106-110 + ... +``` + +**Recovery Steps**: +1. Read from offset 1024 +2. Parse LogGroups +3. Extract records 100-110 (skip <100) +4. Rebuild mapping: 100→{idx:50, off:1024}, 101→{idx:51, off:1056}, ... +5. RAFT replays logs 100-110 +6. Resume at LSN 111 + +--- + +## Summary + +### Append Flow +**Purpose**: Persist new write operations to disk +**Key Steps**: +1. Leader writes data to data service +2. Leader proposes to RAFT via `raft_server()->append_entries_ext()` +3. RAFT replicates to all members via `append()` +4. All members persist to LogStore (in-memory) +5. LogDev batches records into LogGroup +6. `end_of_append_batch` triggers flush to disk +7. LogDev physically writes to journal device + +**Critical Path**: Data must be written before log flush (for followers) + +--- + +### Compact/Truncate Flow +**Purpose**: Reclaim space from old snapshotted logs +**Key Steps**: +1. RAFT creates snapshot at LSN X +2. Compute `effective_compact_lsn = min(X, truncation_upper_limit)` +3. Truncate LogStore in-memory (update start_lsn) +4. LogDev finds minimum truncation point across all stores +5. Physically truncate journal device +6. Persist compact_lsn in checkpoint + +**Key Constraint**: `truncation_upper_limit` ensures no follower is left behind + +--- + +### Rollback Flow +**Purpose**: Resolve log conflicts during leader changes +**Key Steps**: +1. New leader sends `write_at(index, new_entry)` +2. Rollback LogStore to index-1 (in-memory only) +3. Append new log at same index +4. Immediately flush to disk +5. Old logs orphaned, reclaimed during next compact + +**Key Point**: No physical truncation, just in-memory rollback + overwrite + +--- + +### Recovery Flow +**Purpose**: Rebuild state after restart +**Key Steps**: +1. Read LogDev metadata (start_dev_offset, start_lsn) +2. Scan LogGroups from start_dev_offset +3. Dispatch records to logstores via callbacks +4. Rebuild `seq_num → logdev_key` mapping +5. RAFT replays logs from snapshot_lsn + 1 +6. Resume normal operation + +**Key Invariant**: `logstore_lsn = raft_lsn - 1` + +--- diff --git a/docs/structures/raft_repl_dev_log_dev.md b/docs/structures/raft_repl_dev_log_dev.md new file mode 100644 index 000000000..321140b03 --- /dev/null +++ b/docs/structures/raft_repl_dev_log_dev.md @@ -0,0 +1,532 @@ +# RaftReplDev and Log Storage Architecture + +This document explains the full layered architecture of HomeStore's log storage subsystem, and how `RaftReplDev` uses it. It covers the relationship between **Log VDev**, **LogDev**, **LogStore**, and **RaftReplDev**, as well as how multiple Placement Groups (PGs) share chunks on the Log VDev. + +--- + +## 1. Layered Architecture Overview + +``` +┌────────────────────────────────────────────────────────────────────┐ +│ RaftReplDev (one per Raft replication group / PG) │ +│ m_data_journal : ReplLogStore │ +│ m_free_blks_journal: HomeLogStore (timeline consistency only) │ +├────────────────────────────────────────────────────────────────────┤ +│ ReplLogStore : HomeRaftLogStore : nuraft::log_store │ +│ Adapter between the NuRaft log_store interface and HomeStore │ +├────────────────────────────────────────────────────────────────────┤ +│ HomeLogStore (one logical LSN namespace, one logstore_id) │ +│ Multiple HomeLogStores can share a single LogDev │ +├────────────────────────────────────────────────────────────────────┤ +│ LogDev (one per PG, one logdev_id) │ +│ Group-commit engine, flush scheduler, multi-store manager │ +├────────────────────────────────────────────────────────────────────┤ +│ JournalVirtualDev (singleton, shared by ALL LogDevs) │ +│ JournalVirtualDev::Descriptor (one per LogDev, exclusive window) │ +│ Physical disk chunks (dynamically allocated from global pool) │ +└────────────────────────────────────────────────────────────────────┘ +``` + +### Key files + +| Layer | Files | +|-------|-------| +| Log VDev | `src/lib/device/journal_vdev.hpp`, `src/lib/device/journal_vdev.cpp` | +| LogDev | `src/lib/logstore/log_dev.hpp`, `src/lib/logstore/log_dev.cpp` | +| LogStoreService | `src/include/homestore/logstore_service.hpp`, `src/lib/logstore/log_store_service.cpp` | +| HomeLogStore | `src/include/homestore/logstore/log_store.hpp`, `src/lib/logstore/log_store.cpp` | +| HomeRaftLogStore | `src/lib/replication/log_store/home_raft_log_store.h`, `.cpp` | +| ReplLogStore | `src/lib/replication/log_store/repl_log_store.h`, `.cpp` | +| RaftReplDev | `src/lib/replication/repl_dev/raft_repl_dev.h`, `.cpp` | + +--- + +## 2. Layer-by-Layer Description + +### 2.1 JournalVirtualDev (Log VDev) + +`JournalVirtualDev` (`src/lib/device/journal_vdev.hpp`) is the **raw storage backend** for all log data. There is exactly **one** `JournalVirtualDev` instance per HomeStore process, held by `LogStoreService::m_logdev_vdev` and shared by all LogDevs. + +It manages a **global chunk pool** (`m_chunk_pool`). Chunks are dynamically allocated from this pool on demand and returned to it when freed by truncation. + +Each LogDev gets its own private **Descriptor** — a sliding-window view into the VDev: + +```cpp +// src/lib/device/journal_vdev.hpp +struct JournalChunkPrivate { + logdev_id_t logdev_id{0}; // which LogDev owns this chunk + bool is_head{false}; // is it the head of the chain? + uint64_t created_at{0}; // creation timestamp (for crash recovery) + uint64_t end_of_chunk{0}; // where valid data ends in this chunk + chunk_num_t next_chunk{0}; // next chunk in this LogDev's chain +}; + +struct Descriptor { + logdev_id_t m_logdev_id; + off_t m_data_start_offset; // left boundary (advances on truncate) + off_t m_end_offset; // right boundary (advances on new chunk) + std::vector> m_journal_chunks; // ordered chunk chain + std::atomic m_write_sz_in_total; + uint64_t m_reserved_sz; // space reserved but not yet written + uint64_t m_total_size; +}; +``` + +**All offsets are monotonically increasing — they never wrap around.** + +`tail_offset() = m_data_start_offset + m_write_sz_in_total + m_reserved_sz` + +A **critical invariant**: a single LogGroup write must be smaller than one chunk size. Writes are therefore guaranteed not to span chunk boundaries: +```cpp +// journal_vdev.cpp:259 +RELEASE_ASSERT_LT(sz, chunk_size, "Size requested greater than chunk size"); +``` + +### 2.2 LogDev + +`LogDev` (`src/lib/logstore/log_dev.hpp`) is a logical "log band" with its own `logdev_id`. Each PG owns one exclusive LogDev. + +Key responsibilities: +- **Group commit**: batches multiple `log_record`s into a `LogGroup` and writes them atomically. At most 2 `LogGroup` objects exist simultaneously (`max_log_group = 2`). +- **Manages multiple HomeLogStores**: a LogDev can contain several stores, each with a distinct `logstore_id`. +- **Flush scheduling**: supports three modes — `INLINE`, `TIMER`, and `EXPLICIT` (Raft always uses `EXPLICIT`). +- **Owns a monotonic global log index** (`m_log_idx`) used to determine truncation safety. + +```cpp +// log_dev.hpp (key members) +std::atomic m_log_idx{0}; // ever-increasing log index +std::shared_ptr m_vdev; +shared m_vdev_jd; // exclusive descriptor +std::unordered_map m_id_logstore_map; +LogGroup m_log_group_pool[max_log_group]; // pool of 2 log groups +flush_mode_t m_flush_mode; +``` + +### 2.3 HomeLogStore + +`HomeLogStore` (`src/include/homestore/logstore/log_store.hpp`) is an independent logical LSN namespace. Multiple stores can coexist within one LogDev. + +Each store holds: +- `m_store_id`: its unique ID within the LogDev +- `m_logdev`: back-pointer to its LogDev +- `m_records`: a `StreamTracker` mapping `logstore_seq_num → logdev_key{idx, dev_offset}` +- `m_start_lsn`, `m_next_lsn`, `m_tail_lsn` + +Write path: +```cpp +// log_store.cpp:64 +auto ret = m_logdev->append_async(m_store_id, req->seq_num, req->data, ...); +``` + +Read path: +```cpp +// log_store.cpp:132-143 +const auto record = m_records.at(seq_num); +const logdev_key ld_key = record.m_dev_key; +const auto b = m_logdev->read(ld_key); +``` + +### 2.4 LogStoreService + +`LogStoreService` (`src/include/homestore/logstore_service.hpp`) is the global manager (singleton via `logstore_service()`). It holds the one `JournalVirtualDev` and a map of `logdev_id → LogDev`. + +--- + +## 3. RaftReplDev and the Log Storage Stack + +### 3.1 Ownership Structure + +``` +LogStoreService (singleton) + ├── m_logdev_vdev : JournalVirtualDev ← one global VDev + └── m_id_logdev_map : { logdev_id → LogDev } + └── LogDev (one per PG) + ├── m_vdev_jd : Descriptor ← exclusive VDev window + ├── m_id_logstore_map: + │ ├── HomeLogStore (data_journal, append_mode=true) + │ └── HomeLogStore (free_blks_journal, append_mode=false, optional) + └── m_logdev_meta : superblock (persists logstore_id list) + +RaftReplDev (one per Raft group) + ├── m_data_journal : ReplLogStore + │ └── HomeRaftLogStore + │ └── m_log_store : HomeLogStore ← same object as above + └── m_free_blks_journal : HomeLogStore (optional, for timeline consistency) + +m_rd_sb persists: { logdev_id, logstore_id } ← used for recovery +``` + +### 3.2 Creation (New PG) + +```cpp +// raft_repl_dev.cpp:71-86 (new RaftReplDev, load_existing=false) +m_data_journal = std::make_shared(*this, *m_state_machine); +m_rd_sb->logdev_id = m_data_journal->logdev_id(); +m_rd_sb->logstore_id = m_data_journal->logstore_id(); + +// If timeline_consistent: +m_free_blks_journal = logstore_service().create_new_log_store( + m_rd_sb->logdev_id, false /* append_mode */); +m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); +``` + +Inside `HomeRaftLogStore` constructor: +```cpp +// home_raft_log_store.cpp:94-98 +// logstore_id == UINT32_MAX means brand-new +m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); +m_log_store = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); +m_logstore_id = m_log_store->get_store_id(); +``` + +Each Raft group gets its **own exclusive LogDev** with `flush_mode = EXPLICIT`. + +### 3.3 Recovery (Existing PG) + +```cpp +// raft_repl_dev.cpp:42-49 +m_data_journal = std::make_shared( + *this, *m_state_machine, + m_rd_sb->logdev_id, m_rd_sb->logstore_id, + [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(...); }, + [this](auto hs, auto lsn) { m_log_store_replay_done = true; ... }); +``` + +Inside `HomeRaftLogStore` constructor: +```cpp +// home_raft_log_store.cpp:100-113 +m_logdev_id = logdev_id; +m_logstore_id = logstore_id; +logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT); +m_log_store_future = logstore_service() + .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) + .thenValue([this](auto log_store) { m_log_store = std::move(log_store); }); +``` + +### 3.4 LSN Offset Convention + +Raft LSNs start from 1; HomeLogStore `seq_num` starts from 0. They differ by exactly 1: + +```cpp +// home_raft_log_store.cpp:40-44 +static constexpr logstore_seq_num_t to_store_lsn(uint64_t raft_lsn) { + return static_cast(raft_lsn - 1); +} +// home_raft_log_store.h:258 +static constexpr repl_lsn_t to_repl_lsn(store_lsn_t store_lsn) { return store_lsn + 1; } +``` + +### 3.5 Write Path (End to End) + +``` +NuRaft::append_entries() + → ReplLogStore::append() [repl_log_store.cpp:9] + → m_sm.localize_journal_entry_finish() // translate remote data pointers to local + → HomeRaftLogStore::append() [home_raft_log_store.cpp:164] + → m_log_store->append_async() // HomeLogStore (auto-assigns next LSN) + → LogDev::append_async() // assigns logid, stores to StreamTracker + + → ReplLogStore::end_of_append_batch() [repl_log_store.cpp:41] + → m_rd.notify_after_data_written() // wait for block data to be persisted + → HomeRaftLogStore::end_of_append_batch() + → m_log_store->flush() + → LogDev::flush_under_guard() + → LogDev::flush() + → prepare_flush() // pack pending log_records into LogGroup + → m_vdev_jd->alloc_next_append_blk() // may trigger append_chunk() + → m_vdev_jd->sync_pwritev() // write to physical disk + → on_flush_completion() // notify HomeLogStore callbacks +``` + +--- + +## 4. How 10 PGs Share 100 Chunks + +Assume: 1 `JournalVirtualDev` with 100 chunks (e.g., 256 MB each), 10 PGs. + +### 4.1 Chunk Allocation Principle + +**Chunks are NOT pre-partitioned across PGs.** All 100 chunks start in the global `ChunkPool`. Each LogDev starts with zero chunks and acquires them on demand. + +When a flush needs space and `tail_offset + write_size >= m_end_offset`: + +```cpp +// journal_vdev.cpp:261-277 alloc_next_append_blk() +if ((tail_offset() + sz) >= m_end_offset) { + append_chunk(); // dequeue one chunk from the global pool +} +``` + +```cpp +// journal_vdev.cpp:208-253 append_chunk() +auto new_chunk = m_vdev.m_chunk_pool->dequeue(); // take from global pool +m_total_size += new_chunk->size(); +m_end_offset += new_chunk->size(); + +if (m_journal_chunks.empty()) { + // First chunk: mark as head, stamp logdev_id + new_chunk_private->is_head = true; + new_chunk_private->logdev_id = m_logdev_id; + new_chunk_private->end_of_chunk = chunk_size; +} else { + // Subsequent chunk: link previous chunk's next_chunk pointer to this one + // and record the actual end_of_chunk in the previous chunk + last_chunk_private->next_chunk = new_chunk->chunk_id(); + last_chunk_private->end_of_chunk = offset_in_chunk; // watermark + m_vdev.update_chunk_private(last_chunk, ...); // persist metadata +} +m_journal_chunks.push_back(new_chunk); +m_vdev.update_chunk_private(new_chunk, ...); +``` + +The chunk's private metadata is **persisted to disk immediately** each time ownership changes, enabling crash-safe recovery. + +### 4.2 Chunk Exclusivity + +A chunk belongs to exactly one entity at any moment: + +| State | Owner | +|-------|-------| +| In `ChunkPool` | No LogDev — free for any | +| In a `Descriptor::m_journal_chunks` | That LogDev exclusively | + +There is no sharing of chunks between LogDevs. The `logdev_id` field in `JournalChunkPrivate` records current ownership. + +### 4.3 Physical Layout on Disk + +Chunks from different LogDevs are **physically interleaved** on disk in no particular order. The logical ordering within a LogDev is maintained solely through the `next_chunk` linked-list pointers stored in each chunk's private metadata. + +Example snapshot (chunks from 10 PGs scattered on disk): + +``` +Physical disk: + chunk[0] → ChunkPool (free) + chunk[1] → PG_2 (head, logdev=2, next=55) + chunk[2] → ChunkPool (free) + chunk[3] → PG_0 (head, logdev=0, next=17) + chunk[4] → ChunkPool (free) + ... + chunk[17] → PG_0 (logdev=0, next=42) + chunk[23] → PG_1 (head, logdev=1, next=0, currently writing) + chunk[42] → PG_0 (logdev=0, next=0, currently writing) + chunk[55] → PG_2 (logdev=2, next=78) + chunk[78] → PG_2 (logdev=2, next=0, currently writing) + ... + remaining → ChunkPool (free) +``` + +Each LogDev's virtual offset space is independent: + +``` +PG_0 Descriptor (virtual tape): + ├── [0 .. chunk_size) chunk[3] (head) + ├── [chunk_size .. 2*chunk_size) chunk[17] + └── [2*chunk_sz .. 3*chunk_size) chunk[42] ← m_end_offset + + m_data_start_offset = 0 (no truncation yet) + tail_offset() ≈ 2*chunk_sz + bytes_written_in_chunk[42] +``` + +### 4.4 Flush Principle + +Flush modes (set per LogDev): + +| Mode | Trigger | Used by | +|------|---------|---------| +| `EXPLICIT` | Caller invokes `flush_under_guard()` | Raft (`end_of_append_batch`) | +| `INLINE` | After `append_async`, if `pending_size >= threshold` | Non-Raft stores | +| `TIMER` | Periodic timer checks if there is pending unflushed data | Non-Raft stores | + +The `LogGroup` header carries a CRC and `prev_grp_crc` chain for integrity validation during recovery. + +**Write alignment constraint**: each `LogGroup` flush must fit within a single chunk: +```cpp +RELEASE_ASSERT_LT(sz, chunk_size, "Size requested greater than chunk size"); +``` +If the remaining space in the current chunk cannot fit the next `LogGroup`, a new chunk is obtained first, then the write proceeds. + +### 4.5 Truncate Principle + +Truncation flows through all layers: + +#### Step 1 — HomeLogStore layer (in-memory) +```cpp +// log_store.cpp:208-258 +bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_only) { + m_records.truncate(upto_lsn); // release in-memory StreamTracker entries + m_start_lsn.store(upto_lsn + 1); + if (!in_memory_only) m_logdev->truncate(); +} +``` + +#### Step 2 — LogDev layer (find safe truncation point) +```cpp +// log_dev.cpp:631-665 LogDev::truncate() +logdev_key min_safe_ld_key = logdev_key::out_of_bound_ld_key(); +for (auto& [store_id, store] : m_id_logstore_map) { + auto [trunc_lsn, trunc_ld_key, tail_lsn] = store.log_store->truncate_info(); + if (trunc_ld_key.idx < min_safe_ld_key.idx) { + min_safe_ld_key = trunc_ld_key; // take the most conservative point + } +} +// All stores (data_journal AND free_blks_journal) must agree before truncating. +m_vdev_jd->truncate(min_safe_ld_key.dev_offset); +m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, ...); +``` + +#### Step 3 — Descriptor layer (release chunks) +```cpp +// journal_vdev.cpp:569-687 Descriptor::truncate(truncate_offset) +// 1. Find which chunk contains truncate_offset → this becomes the new head. +// 2. Persist is_head=true on the new head chunk (crash-safe: two heads may +// exist transiently; recovery picks the one with higher created_at). +// 3. Walk the chunk list from the old head: +for (auto it = m_journal_chunks.begin(); ...) { + if (cover_offset <= truncate_offset) { + m_total_size -= chunk->size(); + m_journal_chunks.erase(it); + m_vdev.release_chunk_to_pool(chunk); // return to global pool + } +} +// 4. Update m_data_start_offset = truncate_offset. +// 5. Reduce m_write_sz_in_total by size_to_truncate. +``` + +```cpp +// journal_vdev.cpp:156-166 release_chunk_to_pool() +*data = JournalChunkPrivate{}; // clear logdev_id and all metadata +update_chunk_private(chunk, data); // persist the clear (crash-safe) +m_chunk_pool->enqueue(chunk); // chunk is now available to any LogDev +``` + +**A chunk is released only when the truncation offset completely covers it.** The chunk containing the new `m_data_start_offset` is retained. + +### 4.6 Recovery: Reconstructing Chunk Chains + +At startup, `JournalVirtualDev::init()` reconstructs each LogDev's chunk chain from persisted metadata: + +```cpp +// journal_vdev.cpp:72-138 +void JournalVirtualDev::init() { + // 1. Scan all chunks; build: logdev_id → head_chunk, chunk_id → chunk + for (auto& [_, chunk] : m_all_chunks) { + if (data->is_head) { + // Among multiple is_head chunks for the same logdev_id, + // pick the one with the highest created_at timestamp. + logdev_head_map[logdev_id] = { chunk_id, created_at }; + } + } + + // 2. For each logdev, follow next_chunk pointers to rebuild the ordered list. + for (auto& [logdev_id, head] : logdev_head_map) { + auto journal_desc = std::make_shared(*this, logdev_id); + auto chunk_num = head.chunk_num; + while (chunk_num != 0) { + journal_desc->m_journal_chunks.push_back(chunk_map[chunk_num]); + chunk_num = data->next_chunk; + } + } + + // 3. Chunks not in any chain are orphans — remove them. + for (auto& [_, chunk] : m_all_chunks) { + if (!visited_chunks.count(chunk->chunk_id())) { + remove_journal_chunks(orphan_chunks); + } + } +} +``` + +The **crash-safe two-head mechanism**: when truncation persists `is_head=true` on the new head before releasing old chunks, a crash leaves two chunks with `is_head=true`. Recovery resolves this by selecting the one with the **higher `created_at`** timestamp, treating the other as orphaned. + +--- + +## 5. Log VDev Capacity and Chunk Allocation Internals + +This section explains why `LogStoreService::create_vdev()` passes `vdev_size=0` and `num_chunks=0` for the Log VDev, how the system still knows the allocatable capacity, and where the actual allocation logic lives. + +### 5.1 Why `vdev_size=0` and `num_chunks=0` still works + +The Log VDev (`JournalVirtualDev`) is created with a *dynamic size* configuration (see `src/lib/logstore/log_store_service.cpp`, `LogStoreService::create_vdev()`): it does **not** pre-create a fixed number of chunks at format time. + +Instead, it relies on an internal **ChunkPool** which can *create* new chunks on demand. The real upper bound is determined by the **physical device data region** and by available chunk-id / chunk-info slots. + +### 5.2 Where chunks come from: `ChunkPool` producer thread + +`JournalVirtualDev` constructs a `ChunkPool` (`src/lib/device/journal_vdev.cpp:47-57`). The pool maintains a background producer thread which creates chunks via `DeviceManager::create_chunk()` when the pool drops below a threshold. + +Key code path: + +- `ChunkPool::producer()` (`src/lib/device/device_manager.cpp:873-899`) + - calls `DeviceManager::create_chunk(dev_type, vdev_id, chunk_size, private_data)` + +### 5.3 Global allocation constraints + +A new chunk can be created only if all of the following remain available: + +1. **Global chunk-id space** (DeviceManager-level) + - `DeviceManager::create_chunk()` reserves a new `chunk_id` from `m_chunk_id_bm`. + - If there are no remaining IDs, it throws: `"System has no room for additional chunk"`. + +2. **Per-PDev chunk-info slots** (PhysicalDev-level) + - `PhysicalDev::create_chunk()` allocates a free *slot* in `m_chunk_info_slots`. + - If there are no remaining slots, it throws: `"System has no room for additional chunk"`. + +3. **Per-PDev free space in the data region** (PhysicalDev-level) + - This is enforced by `PhysicalDev::find_next_chunk_area()`. + +### 5.4 The physical device data region: `data_start_offset()` and `data_end_offset()` + +Chunk data cannot be placed at arbitrary offsets on a physical device. It must be placed inside the device's **data region**, which excludes the HomeStore superblocks/metadata areas. + +- The data start offset is computed during formatting in `DeviceManager::populate_pdev_info()` (`src/lib/device/device_manager.cpp:708-713`): + - `pinfo.data_offset = round_up(first_block_offset + total_superblock_size, phys_page_size)` + +- PhysicalDev exposes: + - `data_start_offset() = m_pdev_info.data_offset` (`src/lib/device/physical_dev.hpp:226`) + - `data_end_offset()` is the end of usable device space, adjusted if the superblock is mirrored in the footer (`src/lib/device/physical_dev.hpp:227-229`). + +Because chunk data must not overlap the superblock areas, chunk allocation searches only inside `[data_start_offset, data_end_offset)`. + +### 5.5 How free space is tracked: `m_chunk_data_area` + +Each `PhysicalDev` maintains an in-memory interval set of allocated chunk data ranges: + +- `PhysicalDev::m_chunk_data_area` (`src/lib/device/physical_dev.hpp:138`) + +It is populated in two ways: + +1. **Recovery path**: as existing chunks are loaded from the superblock, their occupied ranges are inserted + - `PhysicalDev::load_chunks()` inserts `[chunk_start_offset, chunk_start_offset + chunk_size)` into `m_chunk_data_area` (`src/lib/device/physical_dev.cpp:400-403`). + +2. **Creation path**: when a new chunk is created, the newly selected range is inserted + - `PhysicalDev::populate_chunk_info()` inserts the chosen interval (`src/lib/device/physical_dev.cpp:448-453`). + +Chunk deletion removes the corresponding range: +- `PhysicalDev::free_chunk_info()` erases the interval (`src/lib/device/physical_dev.cpp:464-467`). + +### 5.6 The actual placement algorithm: `find_next_chunk_area()` + +The physical placement decision is made by `PhysicalDev::find_next_chunk_area(size)` (`src/lib/device/physical_dev.cpp:474-485`). It performs a simple first-fit scan: + +- Start with the candidate interval `[data_start_offset, data_start_offset + size)`. +- If it overlaps an existing allocated interval, move the candidate to start at `exist_ival.upper()`. +- If the candidate's upper bound exceeds `data_end_offset`, allocation fails with `"Physical dev has no room for additional chunk"`. + +This means **chunks from different VDevs can be physically interleaved on the same physical device**, because all VDevs allocate from the same per-PDev data region and are only separated by chunk ownership metadata (`chunk_info.vdev_id`). + +--- + +## 6. Summary of Key Properties + +| Property | Behavior | +|----------|----------| +| Chunk allocation | Demand-driven from global pool; no pre-partitioning | +| Chunk exclusivity | A chunk belongs to exactly one LogDev at any time | +| Physical layout | Chunks from different LogDevs are interleaved on disk | +| Offset space | Per-LogDev, monotonically increasing, never wraps | +| Write boundary | A single LogGroup flush must fit within one chunk | +| Flush mode (Raft) | `EXPLICIT` — triggered by `end_of_append_batch` only | +| Truncation safety | Governed by the minimum `trunc_ld_key` across all LogStores in a LogDev | +| Chunk release | Only when truncation fully covers the chunk | +| Recovery | Per-chunk metadata (`logdev_id`, `next_chunk`, `is_head`, `created_at`) | +| Crash safety | New head persisted before old chunks released; two-head resolved by `created_at` | diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 3e4dda2a0..2c19910f7 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -566,7 +566,7 @@ void JournalVirtualDev::Descriptor::update_tail_offset(off_t tail) { LOGINFOMOD(journalvdev, "Updated tail offset arg 0x{} desc {} ", to_hex(tail), to_string()); } -off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset) { +off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset, logid_t log_idx) { const off_t ds_off = data_start_offset(); COUNTER_INCREMENT(m_vdev.m_metrics, vdev_truncate_count, 1); HS_PERIODIC_LOG(DEBUG, journalvdev, "truncating to logical offset: 0x{} desc {}", to_hex(truncate_offset), @@ -605,6 +605,14 @@ off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset) { auto* private_data = r_cast< JournalChunkPrivate* >(const_cast< uint8_t* >(new_head_chunk->user_private())); private_data->is_head = true; private_data->logdev_id = m_logdev_id; + if (log_idx != -1) { + // Write recovery hint so do_load can handle a crash between this truncation and + // the subsequent logdev meta persist. Only written when log_idx is known. + private_data->head_version = JournalChunkPrivate::JOURNAL_HEAD_VERSION; + private_data->head_magic = JournalChunkPrivate::JOURNAL_HEAD_MAGIC; + private_data->head_start_offset = truncate_offset; + private_data->head_start_idx = log_idx; + } m_vdev.update_chunk_private(new_head_chunk, private_data); // Find all chunks which needs to be removed from the start of m_journal_chunks. @@ -807,6 +815,11 @@ nlohmann::json JournalVirtualDev::Descriptor::get_status(int log_level) const { return j; } +const JournalChunkPrivate* JournalVirtualDev::Descriptor::head_chunk_private() const { + if (m_journal_chunks.empty()) { return nullptr; } + return r_cast< const JournalChunkPrivate* >(m_journal_chunks.front()->user_private()); +} + std::string JournalVirtualDev::Descriptor::to_string() const { off_t tail = static_cast< off_t >(data_start_offset() + m_write_sz_in_total.load(std::memory_order_relaxed)) + m_reserved_sz; diff --git a/src/lib/device/journal_vdev.hpp b/src/lib/device/journal_vdev.hpp index 460db4012..345045b3a 100644 --- a/src/lib/device/journal_vdev.hpp +++ b/src/lib/device/journal_vdev.hpp @@ -35,11 +35,21 @@ using journal_id_t = uint64_t; // Each log device has a list of journal chunk data with next_chunk. // Journal vdev will arrange the chunks in order during recovery. struct JournalChunkPrivate { + // Fields below are the original on-disk layout (must stay at these offsets). logdev_id_t logdev_id{0}; bool is_head{false}; // Is it the head element. uint64_t created_at{0}; // Creation timestamp uint64_t end_of_chunk{0}; // The offset indicates end of chunk. chunk_num_t next_chunk{0}; // Next chunk in the list. + + // Head chunk recovery hint appended after the original fields. + // Only valid when is_head==true and head_magic==JOURNAL_HEAD_MAGIC. + static constexpr uint32_t JOURNAL_HEAD_VERSION{1u}; + static constexpr uint32_t JOURNAL_HEAD_MAGIC{0xBEEFCAFE}; + uint32_t head_version{0}; + uint32_t head_magic{0}; + off_t head_start_offset{0}; // logdev truncate offset persisted with head + logid_t head_start_idx{0}; // logdev truncate idx persisted with head }; static_assert(sizeof(JournalChunkPrivate) <= chunk_info::user_private_size, "Journal private area bigger"); @@ -286,7 +296,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : return the new data start offset after truncation. */ - off_t truncate(off_t truncate_offset); + off_t truncate(off_t truncate_offset, logid_t log_idx = -1); /** * @brief : get the total size in journal @@ -332,6 +342,8 @@ class JournalVirtualDev : public VirtualDev { logdev_id_t logdev_id() const { return m_logdev_id; } + const JournalChunkPrivate* head_chunk_private() const; + std::string to_string() const; private: diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 60d269509..609902a00 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -233,6 +233,46 @@ void LogDev::do_load(off_t device_cursor) { break; } + if (loaded_from == -1 && header->start_idx() > m_log_idx) { + // Corner-case recovery: a SIGKILL landed after vdev chunk truncation (durable) but + // before logdev meta persist. The chunk list reflects the new head; logdev meta + // still carries the old (dev_offset, log_idx). We validate via the recovery hint + // written atomically into the head chunk during truncation and then advance m_log_idx + // to match the first log group the stream reader actually found. + THIS_LOGDEV_LOG(WARN, + "LogDev {} corner-case recovery: header->start_idx()={} > m_log_idx={} " + "(stale meta dev_offset={}); validating head chunk recovery hint", + m_logdev_id, header->start_idx(), m_log_idx.load(), device_cursor); + + auto const* hint = m_vdev_jd->head_chunk_private(); + HS_REL_ASSERT(hint, "logdev={} corner-case recovery: head chunk hint is missing; possible data corruption", + m_logdev_id); + HS_REL_ASSERT(hint->is_head && hint->head_magic == JournalChunkPrivate::JOURNAL_HEAD_MAGIC && + hint->head_version == JournalChunkPrivate::JOURNAL_HEAD_VERSION, + "logdev={} corner-case recovery: start_idx={} > m_log_idx={} but head chunk hint is invalid " + "(magic=0x{} version={} is_head={}); possible data corruption", + m_logdev_id, header->start_idx(), m_log_idx.load(), to_hex(hint->head_magic), + hint->head_version, hint->is_head); + HS_REL_ASSERT_GT(hint->head_start_idx, m_log_idx.load(), + "logdev={} corner-case recovery: hint->head_start_idx={} must be > stale m_log_idx={}", + m_logdev_id, hint->head_start_idx, m_log_idx.load()); + + THIS_LOGDEV_LOG( + WARN, "LogDev {} corner-case recovery: advancing m_log_idx {} -> {} data_start_offset {} -> {}", + m_logdev_id, m_log_idx.load(), hint->head_start_idx, device_cursor, hint->head_start_offset); + + // Scope and known limitations of this recovery: + // - This fix is scoped to logstream positioning only. It does NOT replay the full set + // of meta changes that the aborted LogDev::truncate() had in-flight, specifically: + // logstore superblock updates (start LSN), unreserve-store, and rollback-info cleanup + // (log_dev.cpp LogDev::truncate()). Those changes may not have been persisted. + m_log_idx.store(hint->head_start_idx, std::memory_order_release); + m_vdev_jd->update_data_start_offset(hint->head_start_offset); + device_cursor = hint->head_start_offset; + do_load(device_cursor); + return; + } + THIS_LOGDEV_LOG(DEBUG, "Found log group header offset=0x{} header {}", to_hex(group_dev_offset), *header); HS_REL_ASSERT_EQ(header->start_idx(), m_log_idx.load(), "log indx is not the expected one"); if (loaded_from == -1) { loaded_from = header->start_idx(); } @@ -665,7 +705,21 @@ uint64_t LogDev::truncate() { uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); // Truncate them in vdev - m_vdev_jd->truncate(min_safe_ld_key.dev_offset); + m_vdev_jd->truncate(min_safe_ld_key.dev_offset, min_safe_ld_key.idx); + +#ifdef _PRERELEASE + // Simulate a crash between vdev chunk truncation (durable, sync_write) and logdev meta + // persist, reproducing the inconsistency that do_load's defensive start_idx check handles. + if (hs()->crash_simulator().crash_if_flip_set("abort_logdev_truncate_before_meta_persist")) { + THIS_LOGDEV_LOG(INFO, + "CRASH FLIP fired: vdev truncated to dev_offset={} log_idx={} but meta NOT persisted. " + "Stale meta on disk: dev_offset={} log_idx={}", + min_safe_ld_key.dev_offset, min_safe_ld_key.idx, m_logdev_meta.get_start_dev_offset(), + m_logdev_meta.get_start_log_idx()); + decr_pending_request_num(); + return num_records_to_truncate; + } +#endif // Update the start offset to be read upon restart m_last_truncate_idx = min_safe_ld_key.idx; diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index 45ecee96f..c13c6141d 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -184,6 +184,29 @@ class LogDevTest : public ::testing::Test { } } + // Like insert_batch_sync but always allocates via malloc with exactly data_size payload bytes, + // giving deterministic log-group sizes needed for chunk-boundary geometry tests. + void insert_batch_fixed(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, + uint32_t data_size) { + std::vector< uint8_t* > bufs; + bufs.reserve(batch); + for (int64_t i = 0; i < batch; ++i) { + uint32_t const total = sizeof(test_log_data) + data_size; + auto* raw = static_cast< uint8_t* >(std::malloc(total)); + auto* d = new (raw) test_log_data(); + d->size = data_size; + const char c = static_cast< char >(((lsn + i) % 94) + 33); + std::memset(d->get_data(), c, data_size); + bufs.push_back(raw); + log_store->write_async(lsn + i, {uintptr_cast(raw), total, false}, nullptr, nullptr); + } + log_store->flush(); + lsn += batch; + for (auto* raw : bufs) { + std::free(raw); + } + } + void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch, uint32_t fixed_size = 0) { auto last = cur_lsn + batch; @@ -797,6 +820,117 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { } } +#ifdef _PRERELEASE +TEST_F(LogDevTest, TruncateChunkMetaInconsistency) { + // Reproduces a crash on restart triggered by a SIGKILL arriving between: + // (1) m_vdev_jd->truncate() — persists new chunk head to disk via sync_write (durable) + // (2) m_logdev_meta.persist() — persists logdev superblock (never reached) + // + // On restart the chunk list reflects the new head chunk (from (1)) but logdev meta still + // holds the old (dev_offset, log_idx) pair (from before (2)). offset_to_chunk() maps the + // stale dev_offset against the new chunk list into the new head chunk where higher-indexed + // log data resides, causing do_load() to fire HS_REL_ASSERT_EQ(start_idx, m_log_idx). + LOGINFO("Step 1: Create a single logdev and logstore"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + const auto store_id = log_store->get_store_id(); + + // Geometry (all sizes assume flush_size_multiple=4096, chunk_size=8MB=8,388,608B): + // + // Each 4096B payload record becomes 4100B total (sizeof(test_log_data)=4 + 4096). + // 4100 % 4096 = 4 != 0 and !data.is_aligned() => always inlined (no OOB zero-copy). + // A batch of 500 records produces 3 log groups: + // LG1 (202rec): round_up(48 + 202x20 + 202x4100 + 24, 4096) = round_up(832,312, 4096) = 835,584B (=204x4096) + // LG2 (202rec): 835,584B + // LG3 ( 96rec): round_up(48 + 96x20 + 96x4100 + 24, 4096) = round_up(395,592, 4096) = 397,312B (= 97x4096) + // Total per batch = 2,068,480B (=505x4096) + // We use insert_batch_fixed (always malloc, never iobuf_alloc) to guarantee exact sizes. + // + // Step 2: 5 batches = 2,500 records (logdev idx 0-2499) + // Batches 1-4: 4 x 2,068,480 = 8,273,920B fills chunk_0 (gap=114,688B to chunk end). + // Batch 5 overflows chunk_0 (114,688B gap fills remainder), continues in chunk_1: + // LG1 at chunk_1[ 0] (idx 2000-2201) + // LG2 at chunk_1[ 835,584] (idx 2202-2403) + // LG3 at chunk_1[1,671,168] (idx 2404-2499) + // m_last_flush_ld_key = {2404, X1=10,059,776} (LG3 of batch 5 in chunk_1). + // + // Step 3 truncation persists X1=10,059,776, Y1=2404 to meta; chunk_0 released, chunk_1 kept. + // + // Step 4: 4 batches = 2,000 records (logdev idx 2500-4499) + // Batches 1-3 continue in chunk_1 from [2,068,480] to [8,273,920]. Batch 4 crosses into + // chunk_2 (114,688B gap fills remainder of chunk_1). In chunk_2 this batch writes: + // LG1 at chunk_2[ 0] (idx 4000-4201) + // LG2 at chunk_2[ 835,584] (idx 4202-4403) + // LG3 at chunk_2[1,671,168] (idx 4404-4499) + // m_last_flush_ld_key = {4404, X2=18,448,384} (LG3 of batch 4 in chunk_2). + // + // Step 7: second truncation targets X2=18,448,384. vdev persists chunk_2 as head (durable), + // releases chunk_1, then flip fires before meta.persist(). On disk: chunk_2 is head but + // meta still holds stale (X1, Y1). + // + // Recovery: update_data_start_offset(X1=10,059,776) with chunk_list=[chunk_2]. + // offset_to_chunk(X1=10,059,776): + // chunk_aligned = round_down(10,059,776, 8MB) = 8,388,608 + // off_l = 10,059,776 - 8,388,608 = 1,671,168 => chunk_2[1,671,168] + // That is exactly the start of LG3 of step-4 batch 4: magic valid, start_idx=4404. + // m_log_idx from stale meta = Y1 = 2404. + // HS_REL_ASSERT_EQ(4404, 2404) fires => crash reproduced. + constexpr uint32_t entry_size = 4096; + constexpr int64_t batch_size = 500; + constexpr int num_batches_step2 = 5; + + LOGINFO("Step 2: Write {} deterministic entries ({}x{}) to fill chunk 0 and land X1 inside chunk 1", + batch_size * num_batches_step2, num_batches_step2, batch_size); + logstore_seq_num_t cur_lsn = 0; + for (int i = 0; i < num_batches_step2; ++i) { + insert_batch_fixed(log_store, cur_lsn, batch_size, entry_size); + } + + LOGINFO("Step 3: Normal chunk-crossing truncation (no flip) — chunk 0 released, logdev meta persisted"); + truncate_validate(log_store); + + // Step 4: 4 more batches continue in chunk_1 then cross into chunk_2. The last log group of + // step 4 (LG3 of batch 4) lands at chunk_2[1,671,168], matching what offset_to_chunk maps + // stale X1=10,059,776 to when only chunk_2 is present after the step-7 crash. + constexpr int num_batches_step4 = 4; + LOGINFO("Step 4: Write {} more deterministic entries ({}x{}) to fill chunk 1 and allocate chunk 2", + batch_size * num_batches_step4, num_batches_step4, batch_size); + for (int i = 0; i < num_batches_step4; ++i) { + insert_batch_fixed(log_store, cur_lsn, batch_size, entry_size); + } + + LOGINFO("Step 5: Register restart callback to re-open logdev and logstore after crash recovery"); + std::promise< bool > reopen_done; + m_helper.change_start_cb([&logdev_id, &store_id, &log_store, &reopen_done]() { + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); + logstore_service() + .open_log_store(logdev_id, store_id, false /* append_mode */) + .thenValue([&log_store, &reopen_done](auto store) { + log_store = store; + reopen_done.set_value(true); + }); + }); + + LOGINFO("Step 6: Inject flip to simulate SIGKILL after vdev chunk truncation, before logdev meta persist"); + m_helper.set_basic_flip("abort_logdev_truncate_before_meta_persist"); + + LOGINFO("Step 7: Trigger second chunk-crossing truncation to fire the flip"); + // Pass false so truncate() calls m_logdev->truncate(), where the flip fires: + // m_vdev_jd->truncate() completes (sync_write, durable) but m_logdev_meta.persist() is + // bypassed. LogDev::stop() skips its healing truncate when is_crashed(), preserving + // the on-disk inconsistency for the restart to exercise. + log_store->truncate(log_store->get_contiguous_completed_seq_num(-1), false /* in_memory_truncate_only */); + + LOGINFO("Step 8: Wait for crash recovery (restart_homestore running in background thread)"); + m_helper.wait_for_crash_recovery(); + reopen_done.get_future().get(); // wait for logstore replay to complete + + LOGINFO("Step 9: Restart succeeded — no do_load assertion fired; log store is valid"); + ASSERT_NE(log_store, nullptr) << "Log store must be valid after crash recovery"; +} +#endif + SISL_OPTION_GROUP(test_log_dev, (num_logdevs, "", "num_logdevs", "number of log devs", ::cxxopts::value< uint32_t >()->default_value("4"), "number"),