From 6fe0959821aa5adb012128ee04de669ab338c2cf Mon Sep 17 00:00:00 2001
From: yawzhang <yawzhang@ebay.com>
Date: Thu, 18 Jun 2026 17:13:47 +0800
Subject: [PATCH] SDSTOR-22293: handle logdev recovery corner case

also add relevant knowledge to docs/structures to support AI in future triaging
---
 conanfile.py                                  |    2 +-
 ...ogdev-truncate-chunk-meta-inconsistency.md |  142 +++
 docs/structures/raft.md                       | 1031 +++++++++++++++++
 docs/structures/raft_repl_dev_log_dev.md      |  532 +++++++++
 src/lib/device/journal_vdev.cpp               |   15 +-
 src/lib/device/journal_vdev.hpp               |   14 +-
 src/lib/logstore/log_dev.cpp                  |   56 +-
 src/tests/test_log_dev.cpp                    |  134 +++
 8 files changed, 1922 insertions(+), 4 deletions(-)
 create mode 100644 docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md
 create mode 100644 docs/structures/raft.md
 create mode 100644 docs/structures/raft_repl_dev_log_dev.md

diff --git a/conanfile.py b/conanfile.py
index 87a52abba..4ec340dfb 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
 
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "7.5.10"
+    version = "7.5.11"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md b/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md
new file mode 100644
index 000000000..8367dee6f
--- /dev/null
+++ b/docs/incidents/bug-logdev-truncate-chunk-meta-inconsistency.md
@@ -0,0 +1,142 @@
+# RCA: LogDev Crash on Restart — Truncation Chunk/Meta Inconsistency
+
+**Date:** 2026-05-22
+**Component:** HomeStore / LogStore / JournalVirtualDev
+**Severity:** Critical (crash on restart, blocks recovery)
+
+---
+
+## Symptom
+
+After a SIGKILL, the process crashes during restart at `log_dev.cpp:234`:
+
+```
+Assertion failure: Expected '473577' to be == to '460780'
+log indx is not the expected one
+```
+
+`do_load` reads the first log group from the persisted start dev_offset
+(`0x54BC8000`) and finds `start_idx=473577`, but the logdev superblock says
+`log_idx=460780`.
+
+---
+
+## Root Cause Chain
+
+### Condition 1: `m_vdev_jd->truncate()` persists chunk metadata synchronously
+
+`JournalVirtualDev::Descriptor::truncate()` calls `update_chunk_private()` for
+every chunk it modifies. The call chain:
+
+```
+update_chunk_private()
+  → chunk->set_user_private()
+    → write_chunk_info()
+      → physical_dev->write_super_block()
+        → m_drive_iface->sync_write()   ← synchronous disk write
+```
+
+Specifically, truncation:
+1. Marks the new head chunk `is_head=true` — **sync_write**
+2. Calls `release_chunk_to_pool()` on each removed chunk, which clears its
+   private data — **sync_write** per chunk
+
+All writes complete before `truncate()` returns.
+**Evidence:** `physical_dev.cpp:122` — `sync_write` call, no async path.
+
+### Condition 2: `m_logdev_meta.persist()` is called after `m_vdev_jd->truncate()` returns
+
+`LogDev::truncate()` (`log_dev.cpp:665–688`):
+
+```cpp
+m_vdev_jd->truncate(min_safe_ld_key.dev_offset);   // chunk state durable
+
+// ← SIGKILL WINDOW OPENS HERE
+
+m_logdev_meta.set_start_dev_offset(..., false);     // in-memory only
+m_logdev_meta.unreserve_store(..., false);           // in-memory only
+m_logdev_meta.remove_rollback_record_upto(..., false); // in-memory only
+
+m_logdev_meta.persist();    // ← logdev superblock written; never reached
+```
+
+When `stopping=false` (normal operation), all intermediate calls use
+`persist_now=false`. Only the final `persist()` writes to disk.
+**Evidence:** code structure, `log_dev.cpp:665–688`.
+
+### Condition 3: On restart, stale logdev meta is interpreted against a new chunk list
+
+`JournalVirtualDev::init()` rebuilds the chunk list from on-disk chunk private
+data (not from logdev meta). After the truncation in Condition 1, the new head
+chunk is persisted. On restart:
+
+- **Chunk list (from disk):** `[Chunk B (is_head=true), ...]`
+  (Chunk A was released and its private data cleared/persisted to disk)
+- **Logdev meta (from disk):** `(dev_offset=0x54BC8000, log_idx=460780)`
+  (old value, persist never ran)
+
+`LogDev::start()` calls:
+```cpp
+m_vdev_jd->update_data_start_offset(0x54BC8000);  // set from stale meta
+m_log_idx = 460780;
+do_load(0x54BC8000);
+```
+
+### Condition 4: `offset_to_chunk(0x54BC8000)` maps to Chunk B in the new list
+
+`journal_vdev.cpp:727`:
+```cpp
+chunk_aligned_offset = round_down(m_data_start_offset, chunk_size);
+off_l = 0x54BC8000 - chunk_aligned_offset;
+// Iterates [Chunk B, ...] → maps to Chunk B at offset off_l
+```
+
+The old offset `0x54BC8000` is now interpreted against the new chunk list. It
+maps to a physical position inside Chunk B, where log data starting at
+`start_idx=473577` resides.
+
+### Condition 5: `do_load` has no handler for `header->start_idx() > m_log_idx`
+
+`log_dev.cpp:225–234`:
+```cpp
+if (loaded_from == -1 && header->start_idx() < m_log_idx) {
+    // handles: journal fully truncated → break
+}
+// No handling for start_idx > m_log_idx ← the actual case (473577 > 460780)
+HS_REL_ASSERT_EQ(header->start_idx(), m_log_idx.load(), ...);  // CRASH
+```
+
+The assert fires because the gap case (truncation advanced further than what
+is recorded in meta) is unhandled.
+
+---
+
+## Trigger Conditions
+
+The following sequence must all occur:
+
+1. LogDev has at least two chunks in its journal chunk list
+2. A truncation is triggered (raft compact or resource pressure) where
+   `min_safe_ld_key.dev_offset` falls in chunk N+1 or later (i.e., the
+   truncation point crosses a chunk boundary, causing chunk N to be released)
+3. SIGKILL arrives after `m_vdev_jd->truncate()` returns but before
+   `m_logdev_meta.persist()` executes
+
+The window is narrow (microseconds to low milliseconds) but non-zero, and
+SIGKILL is unblockable.
+
+**Observed instance (2026-05-10):**
+- `log_dev=1`, pre-kill flush at `log_idx=490277`
+- Truncation with `min_safe_ld_key.idx=475195`
+- Persisted meta: `(dev_offset=0x54BC8000, log_idx=460780)`
+- First log in new chunk list: `start_idx=473577`
+
+---
+
+## Skipped Conditions
+
+None. All four conditions confirmed via static code analysis and log evidence.
+The chunk list change (Condition 3) is proven by the crash itself via proof by
+contradiction: if the chunk list had not changed, `offset_to_chunk(0x54BC8000)`
+would map to the original Chunk A and recover log 460780 correctly; the
+observed mismatch is impossible without a chunk list change.
diff --git a/docs/structures/raft.md b/docs/structures/raft.md
new file mode 100644
index 000000000..9f5297a34
--- /dev/null
+++ b/docs/structures/raft.md
@@ -0,0 +1,1031 @@
+# RAFT Replication Log Management
+
+This document provides a detailed explanation of RAFT log management in HomeStore, covering the architecture, append flow, compact/truncate flow, and rollback flow. It serves as a reference for code understanding and issue analysis.
+
+## Architecture Overview
+
+### Component Stack and Responsibilities
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ NuRaft (nuraft library)                                     │
+│ - Manages RAFT consensus protocol                           │
+│ - Calls log_store interface to persist/retrieve logs        │
+└─────────────────────────────────────────────────────────────┘
+                            │
+                            │ nuraft::log_store interface
+                            ▼
+┌─────────────────────────────────────────────────────────────┐
+│ ReplLogStore                                                │
+│ - Implements nuraft::log_store interface                    │
+│ - Manages repl_req lifecycle and LSN mapping                │
+│ - Coordinates data channel and raft channel                 │
+│ - Inherits from HomeRaftLogStore                            │
+└─────────────────────────────────────────────────────────────┘
+                            │
+                            │ Delegates to parent class
+                            ▼
+┌─────────────────────────────────────────────────────────────┐
+│ HomeRaftLogStore                                            │
+│ - Converts between RAFT LSN and LogStore LSN               │
+│ - Serializes/deserializes nuraft log entries               │
+│ - Caches log entries for fast access                        │
+│ - Contains m_log_store (HomeLogStore)                       │
+└─────────────────────────────────────────────────────────────┘
+                            │
+                            │ Uses HomeLogStore for storage
+                            ▼
+┌─────────────────────────────────────────────────────────────┐
+│ HomeLogStore                                                │
+│ - Manages log sequence numbers (LSNs)                       │
+│ - Tracks logdev_key for each LSN                            │
+│ - Maintains truncation boundaries                           │
+│ - One logstore per RAFT log                                 │
+└─────────────────────────────────────────────────────────────┘
+                            │
+                            │ Writes to LogDev
+                            ▼
+┌─────────────────────────────────────────────────────────────┐
+│ LogDev                                                      │
+│ - Physical log device managing actual disk writes           │
+│ - Batches multiple log records into LogGroups               │
+│ - Assigns logdev_key {log_idx, dev_offset} to each record  │
+│ - Currently: one LogDev per HomeLogStore (1:1 mapping)      │
+│ - Performs physical truncation on journal vdev              │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### LSN and Key Concepts
+
+#### LSN (Log Sequence Number) Mapping
+- **RAFT LSN** (repl_lsn): Log index in RAFT, starts from 1
+- **LogStore LSN** (store_lsn): Sequence number in HomeLogStore = `repl_lsn - 1`
+- **Conversion formula**: `store_lsn = repl_lsn - 1`
+- **Example**: RAFT LSN 100 → LogStore LSN 99
+
+#### LogDev Key Structure
+```cpp
+struct logdev_key {
+    logid_t idx;        // Log index in LogDev
+    off_t dev_offset;   // Physical offset on journal device
+}
+```
+- Each log written to LogDev gets a unique logdev_key
+- The `idx` is assigned sequentially by LogDev, independent of store_lsn
+- The `dev_offset` is the actual byte offset on the physical journal device
+
+#### Replication Request (repl_req)
+Represents a single write operation in the replication pipeline:
+- **Contents**: user header, key, data buffer, LSN
+- **State tracking**: DATA_WRITTEN, LOG_FLUSHED, etc.
+- **Creation timing**:
+  - **Leader**: Created in `async_alloc_write` before proposing to RAFT
+  - **Follower**: Created when first receiving the entry, via two possible paths:
+    - **RAFT channel**: In `ReplLogStore::append` when receiving RAFT log entry
+    - **Data channel**: In `on_push_data_received` when receiving actual data
+    - Whichever arrives first creates the repl_req; the other finds it already exists
+
+---
+
+## Append Flow
+
+The append flow describes how a write operation is replicated and persisted. It involves two parallel channels:
+- **Data Channel**: Transfers actual user data from leader to followers
+- **RAFT Channel**: Transfers RAFT log entries for consensus
+
+### High-Level Flow
+
+```
+Leader:
+  1. async_alloc_write (allocate blocks, write data)
+  2. raft_server()->append_entries_ext() (propose to RAFT)
+  3. RAFT replicates log entries to all members
+
+All Members (Leader & Followers):
+  4. ReplLogStore::append (receive log entry)
+  5. HomeRaftLogStore::append (add to logstore in-memory)
+  6. LogDev::append_async (add to pending flush queue)
+  7. ReplLogStore::end_of_append_batch (flush to disk)
+  8. LogDev::flush (physical write)
+```
+
+---
+
+### Step 1: Leader Proposes to RAFT
+
+**Entry Point**: `RaftReplDev::async_alloc_write`
+
+**What happens**:
+1. Leader receives write request with header, key, and data
+2. Allocates blocks from data service
+3. Writes actual data to allocated blocks
+4. Creates `repl_req` to track this write operation
+5. Serializes write info into RAFT log entry
+6. Calls `raft_server()->append_entries_ext()` to propose entry to RAFT
+
+**Key Point**: Leader writes data BEFORE proposing to RAFT. When log is flushed later, data is already on disk.
+
+**Output**: RAFT begins replicating this log entry to followers
+
+---
+
+### Step 2: RAFT Replicates to All Members
+
+**What happens**:
+- NuRaft library handles replication via network
+- All members (including leader) receive log entries via `append()` callback
+- RAFT batches multiple log entries together for efficiency
+
+**Key Point**: From this point on, leader and followers follow the same append path through logstore.
+
+---
+
+### Step 3: Receive Log Entry in ReplLogStore
+
+**Entry Point**: `ReplLogStore::append`
+
+**What happens**:
+1. Check if entry is app_log (skip config entries)
+2. **Create or retrieve repl_req**:
+   - **For Leader**: repl_req already exists from Step 1
+   - **For Follower**: Create repl_req now (first time seeing this entry)
+     - Can be triggered by **RAFT channel** (this function) OR **data channel** (`on_push_data_received`)
+     - Whichever arrives first creates the repl_req
+3. Link the LSN to repl_req for future lookup
+4. Delegate to parent class `HomeRaftLogStore::append`
+
+**Output**: Returns RAFT LSN (repl_lsn)
+
+---
+
+### Step 4: Add to LogStore (In-Memory)
+
+**Entry Point**: `HomeRaftLogStore::append`
+
+**What happens**:
+1. Serialize nuraft log entry to buffer format:
+   ```
+   [8 bytes: term][1 byte: log_val_type][N bytes: user data]
+   ```
+2. Convert RAFT LSN to LogStore LSN: `store_lsn = repl_lsn - 1`
+3. Call `HomeLogStore::append_async` with serialized buffer
+4. Cache the log entry at position `lsn % cache_size` for fast access
+5. Return RAFT LSN
+
+**Key Point**: Log is still in MEMORY, not on disk yet.
+
+**Output**: Log entry added to logstore's in-memory pending queue
+
+---
+
+### Step 5: Enqueue to LogDev (In-Memory)
+
+**Entry Point**: `LogDev::append_async`
+
+**What happens**:
+1. Create `log_record` structure:
+   ```cpp
+   log_record {
+       store_id: which logstore this belongs to
+       seq_num: logstore sequence number (from HomeLogStore)
+       data: serialized log entry
+       context: pointer to logstore_req
+   }
+   ```
+2. Assign a unique log_idx (LogDev's internal sequence number)
+3. Add record to `m_log_records` stream tracker (in-memory queue)
+4. Increment `m_pending_flush_size` by data size
+5. Return the seq_num to caller
+
+**Two Index Values**:
+- **log_idx returned by write_at inside this function**: LogDev's internal index, **NOT USED** 
+- **seq_num in log_record**: This is the logstore's LSN (from `m_next_lsn`), which is the actual LSN used
+
+**Key Point**: Multiple logstores can share one LogDev. Each record is tagged with store_id.
+
+**Output**: log_idx assigned, waiting for batch flush
+
+---
+
+### Step 6: End of Append Batch - Coordination
+
+**Entry Point**: `ReplLogStore::end_of_append_batch`
+
+This is called by NuRaft after appending a batch of entries (RAFT channel callback).
+
+**What happens**:
+
+1. **Separate requests into two groups**:
+   - **Proposer requests** (leader's own writes): Data already written in Step 1
+   - **Non-proposer requests** (follower's received writes): Data may not be written yet
+
+2. **For non-proposer requests (followers)**:
+   - Call `notify_after_data_written()` to check if data is written
+   - Future completes when data is confirmed written to data service
+   - **Critical**: MUST ensure data write completion before log flush
+
+3. **Flush logs to disk**:
+   - Call `HomeRaftLogStore::end_of_append_batch`
+   - This triggers actual disk flush
+
+4. **Mark completion**:
+   - Mark all repl_reqs as `LOG_FLUSHED`
+   - Set `is_volatile = false` on all repl_reqs
+
+**Why wait for data?**
+If logs are flushed before data is written, upon recovery we'd replay logs but data would be missing, causing data inconsistency.
+
+**Output**: Logs are now durable on disk, data is written
+
+---
+
+### Step 7: Flush LogStore
+
+**Entry Point**: `HomeRaftLogStore::end_of_append_batch`
+
+**What happens**:
+1. Calculate end LSN: `end_lsn = store_lsn(start + count - 1)`
+2. Call `m_log_store->flush(end_lsn)`
+3. Update `m_last_durable_lsn = end_lsn`
+
+**Entry Point**: `HomeLogStore::flush`
+
+**What happens**:
+- Delegates to LogDev to perform actual flush
+- No-op if already flushed
+
+---
+
+### Step 8: Physical Flush to Device
+
+**Entry Point**: `LogDev::flush`
+
+This is where logs physically hit the disk.
+
+**What happens**:
+
+1. **Gather pending records**:
+   - Get all log records from `m_log_records` between `m_last_flush_idx` and current `m_log_idx`
+   - These are the in-memory log records waiting to be flushed
+
+2. **Create LogGroup**:
+   - LogGroup is the unit of physical write
+   - Contains header, multiple log records, inline data, OOB data, and footer
+   - Format (based on code comments in log_dev.hpp):
+     ```
+     [LogGroup Header: #records, oob_area_offset, inline_area_offset, ...]
+     [Record 1: size, offset, is_inlined, store_seq_num, store_id]
+     [Record 2: size, offset, is_inlined, store_seq_num, store_id]
+     ...
+     [Inline Data Area: data for small records]
+     [OOB (Out-of-Band) Data Area: data for large records]
+     [Footer: magic, start_log_idx]
+     ```
+
+   **Inline vs OOB**:
+   - Small records: Data stored in inline area
+   - Large records: Data stored in OOB area
+   - Record's `is_inlined` flag indicates which area to read from
+
+3. **Allocate space on journal vdev**:
+   - Call `m_vdev_jd->alloc_next_append_blk(total_size)`
+   - Returns physical offset on journal device
+   - Store this in `lg->m_log_dev_offset`
+
+4. **Assign logdev_key to each record**:
+   **All records in same LogGroup share the same dev_offset**
+     ```cpp
+     logdev_key {
+         idx: unique log_idx for this record (from LogDev sequence)
+         dev_offset: lg->m_log_dev_offset (SAME for entire LogGroup) 
+     }
+     ```
+
+5. **Write to physical device**:
+   - Prepare iovecs (scatter-gather list) from LogGroup
+   - Call `m_vdev_jd->sync_pwritev(iovecs, offset)`
+   - This is a synchronous write ensuring data is on disk
+
+6. **Update tracking**:
+   - Update `m_last_flush_idx` to last flushed log_idx
+   - Update `m_last_flush_ld_key` with the logdev_key of last flushed log
+   - Decrement `m_pending_flush_size`
+
+7. **Notify completion**:
+   - For each log record in LogGroup:
+     - Find corresponding HomeLogStore via store_id
+     - Call logstore's `on_write_completion(req, logdev_key, flush_ld_key)`
+     - LogStore updates its in-memory mapping: `seq_num → {logdev_key, trunc_key}`
+     - Call user's completion callback
+
+**Key Points**:
+- Multiple log records are batched into one LogGroup for efficiency
+- Each log gets a unique logdev_key for later retrieval
+- Writes are synchronous (blocking) to ensure durability
+
+**Output**: Logs are now physically durable on journal device
+
+---
+
+### Follower Data Channel Flow
+
+Followers receive data via data channel in parallel with raft channel.
+
+**Entry Point**: `RaftReplDev::on_push_data_received`
+
+**What happens**:
+1. Leader calls `push_data_to_all_followers` after Step 1
+2. Follower receives push data RPC
+3. Create or retrieve repl_req (may already exist from RAFT channel)
+4. Save pushed data buffer in repl_req
+5. Write data to data service: `data_service().async_write()`
+6. Mark repl_req as `DATA_WRITTEN` and fulfill promise
+7. If RAFT channel is waiting in Step 6, it now proceeds
+
+**Race Handling**:
+- **Data arrives first**: Data written, RAFT channel finds data ready
+- **RAFT arrives first**: RAFT channel waits for data via future
+- Either order works correctly
+
+---
+
+## Compact/Truncate Flow
+
+Compact/truncate reclaims space from old logs that have been snapshotted and are no longer needed for recovery or replication.
+
+### Trigger and Context
+
+**When it happens**:
+- RAFT creates snapshot periodically
+- After snapshot completes, RAFT calls `compact(upto_lsn)` to remove old logs
+- Manually triggered via `trigger_snapshot_creation`
+
+**Key Constraint - truncation_upper_limit**:
+- Calculated by leader based on all followers' replication progress
+- Definition: `max LSN that can be safely truncated without affecting any follower`
+- Computed in `RaftReplDev::propose_truncate_boundary`:
+  ```
+  minimum_repl_idx = min(follower1_idx, follower2_idx, ...)
+  truncation_upper_limit = max(commit_lsn - reserve_threshold, minimum_repl_idx)
+  ```
+- Ensures no follower needs logs we're about to delete
+
+---
+
+### Step 1: Compute Effective Compact LSN
+
+**Entry Point**: `ReplLogStore::compact`
+
+**What happens**:
+1. RAFT wants to compact up to `compact_upto_lsn` (from snapshot LSN)
+2. Get current `truncation_upper_limit` from ReplDev
+3. Compute safe compact point:
+   ```cpp
+   effective_compact_lsn = min(compact_upto_lsn, truncation_upper_limit)
+   ```
+4. Call `m_rd.on_compact(effective_compact_lsn)`:
+   - Updates ReplDev's `m_compact_lsn` to effective_compact_lsn
+   - This value will be persisted later during checkpoint flush
+5. Proceed to compact LogStore: `HomeRaftLogStore::compact(effective_compact_lsn)`
+
+**Why limit by truncation_upper_limit?**
+- RAFT snapshot may be at LSN 10000
+- But a slow follower may only be at LSN 8000
+- We can't truncate logs beyond 8000 or follower can't catch up
+- So effective_compact_lsn = min(10000, 8000) = 8000
+
+**Output**: Safe LSN determined for truncation
+
+---
+
+### Step 2: Compact in HomeRaftLogStore
+
+**Entry Point**: `HomeRaftLogStore::compact`
+
+**What happens**:
+1. Validate compact_lsn:
+   - Check if `compact_lsn > current_max_lsn`
+   - If true, log a warning (may indicate holes in log)
+   - Still proceed with truncation
+2. Convert RAFT LSN to LogStore LSN:
+   ```cpp
+   store_lsn = compact_lsn - 1
+   ```
+3. Call `m_log_store->truncate(store_lsn, false)`:
+   - `false` means perform physical truncation (not in-memory only)
+
+**Output**: Request to truncate logstore up to store_lsn
+
+---
+
+### Step 3: Truncate LogStore (In-Memory)
+
+**Entry Point**: `HomeLogStore::truncate`
+
+**What happens**:
+
+1. **Flush any pending logs first**:
+   - Call `flush()` to ensure all in-memory logs are on disk
+   - This prevents truncating logs that aren't yet durable
+
+2. **Determine truncation key**:
+   - If `upto_lsn <= m_tail_lsn` (normal case):
+     - Get record at `upto_lsn`: `rec = m_records.at(upto_lsn)`
+     - Extract `m_trunc_ld_key = rec.m_trunc_key`
+     - This logdev_key marks the truncation boundary
+   - If `upto_lsn > m_tail_lsn` (baseline resync case):
+     - Update `m_tail_lsn` and `m_next_lsn` to upto_lsn
+     - Insert empty record at upto_lsn
+     - This handles cases where snapshot LSN exceeds current logs
+
+3. **Truncate in-memory records**:
+   - Call `m_records.truncate(upto_lsn)`
+   - Removes all records with `seq_num <= upto_lsn` from memory
+   - These LSNs are now invalid for future reads
+
+4. **Update start LSN**:
+   - Set `m_start_lsn = upto_lsn + 1`
+   - Future log reads/writes must be >= upto_lsn + 1
+
+5. **Trigger physical truncation**:
+   - If `!in_memory_truncate_only`: call `m_logdev->truncate()`
+   - This propagates truncation to LogDev layer
+
+**Key State**:
+- `m_start_lsn`: First valid LSN in logstore (upto_lsn + 1)
+- `m_tail_lsn`: Last written LSN
+- `m_trunc_ld_key`: LogDev key for truncation boundary
+
+**Output**: In-memory state updated, physical truncation triggered
+
+---
+
+### Step 4: Truncate LogDev (Physical)
+
+**Entry Point**: `LogDev::truncate`
+
+**What happens**:
+
+1. **Lock order** (important for avoiding deadlock):
+   - Acquire flush_guard (prevents concurrent flush)
+   - Acquire store_map read lock (protects logstore map)
+   - Acquire meta_mutex (protects metadata updates)
+
+2. **Find minimum safe truncation point across all logstores**:
+   ```cpp
+   logdev_key min_safe_ld_key = out_of_bound;
+
+   for each logstore on this logdev:
+       (trunc_lsn, trunc_ld_key, tail_lsn) = logstore->truncate_info()
+
+       // Update logstore superblock with new start LSN
+       logstore_sb.m_first_seq_num = trunc_lsn + 1
+       persist(logstore_sb)
+
+       // Track minimum across all stores
+       if trunc_ld_key.idx < min_safe_ld_key.idx:
+           min_safe_ld_key = trunc_ld_key
+   ```
+
+   **Why find minimum?**
+   - One LogDev can serve multiple logstores (though currently 1:1)
+   - Can only truncate up to the slowest logstore's boundary
+   - Ensures no logstore loses its needed data
+
+3. **Handle edge cases**:
+   - If all logstores are empty: `min_safe_ld_key = m_last_flush_ld_key`
+   - If no truncation needed (min_safe_ld_key <= m_last_truncate_idx):
+     - Still persist logstore superblocks
+     - Return 0 (no space reclaimed)
+
+4. **Physical truncation on journal vdev**:
+   - Call `m_vdev_jd->truncate(min_safe_ld_key.dev_offset)`
+   - Reclaims space on journal device up to this offset
+   - All data before this offset is now freed
+
+5. **Update LogDev metadata**:
+   - Set new `start_dev_offset = min_safe_ld_key.dev_offset`
+   - Set new `start_log_idx = min_safe_ld_key.idx`
+   - Persist metadata via `m_logdev_meta.persist()`
+   - Update `m_last_truncate_idx = min_safe_ld_key.idx`
+
+6. **Return reclaimed records count**:
+   ```cpp
+   num_records = min_safe_ld_key.idx - m_last_truncate_idx
+   ```
+
+**LogDev Metadata Persistence**:
+The truncation info is stored in logdev superblock:
+```cpp
+struct logdev_superblk {
+    start_dev_offset: physical offset to start reading from
+    key_idx: log_idx to start reading from
+    ...
+}
+```
+
+During recovery (`LogDev::do_load`), logs are read from this offset forward.
+
+**Output**: Physical space reclaimed on journal device
+
+---
+
+### Step 5: Persist Compact LSN in Checkpoint
+
+**Entry Point**: `RaftReplDev::cp_flush`
+
+This happens during periodic checkpoint flush.
+
+**What happens**:
+1. Read current `m_compact_lsn` from ReplDev (updated in Step 1)
+2. Write to repl dev superblock:
+   ```cpp
+   m_rd_sb->compact_lsn = m_compact_lsn.load()
+   m_rd_sb.write()
+   ```
+3. Superblock is persisted to meta service
+
+**Why persist?**
+- On recovery, need to know what LSN was compacted to
+- Determines safe starting point for log replay
+- Prevents re-processing already-snapshotted logs
+
+**Checkpoint Flush Trigger**:
+- Periodic: triggered by CP manager
+- Manual: via `trigger_snapshot_creation`
+- Before snapshot: to ensure clean state
+
+**Output**: Compact LSN durable on disk
+
+---
+
+### Complete Flow Summary
+
+```
+Step 1: ReplLogStore::compact
+   - Limit by truncation_upper_limit
+   - effective_compact_lsn = min(snapshot_lsn, truncation_upper_limit)
+   - Update m_compact_lsn in ReplDev
+
+Step 2: HomeRaftLogStore::compact
+   - Convert repl_lsn to store_lsn
+
+Step 3: HomeLogStore::truncate
+   - Flush pending logs
+   - Get m_trunc_ld_key from record
+   - Truncate in-memory records
+   - Update m_start_lsn
+
+Step 4: LogDev::truncate
+   - Find min truncation point across all logstores
+   - Persist logstore superblocks
+   - Truncate journal vdev physically
+   - Persist logdev metadata
+
+Step 5: RaftReplDev::cp_flush (during checkpoint)
+   - Persist m_compact_lsn to repl dev superblock
+```
+
+---
+
+## Rollback Flow
+
+Rollback handles the case when a follower (or old leader) has uncommitted logs that conflict with the new leader's logs. This happens during leader elections.
+
+### When Rollback Occurs
+
+**Scenario**:
+1. Old leader appended logs at LSN 100-105 but crashed before committing
+2. New leader elected, has committed logs only up to LSN 99
+3. New leader sends its own logs starting at LSN 100 (different from old leader's)
+4. Follower (old leader) must rollback LSN 100-105 and accept new leader's logs
+
+**NuRaft's Two-Phase Process**:
+1. **Phase 1 - Rollback**: Rollback invalid logs in **reverse order** (105 → 104 → ... → 100)
+2. **Phase 2 - Overwrite**: Write new logs in **forward order** (100 → 101 → ...)
+
+---
+
+### Phase 1: Rollback Invalid Logs (Reverse Order)
+
+**Entry Point**: NuRaft calls `state_machine->rollback_ext()` for each log in reverse
+
+**NuRaft's Logic**:
+```cpp
+// Rollback from last log backwards to conflict point
+for (idx = my_last_log_idx; idx >= conflict_idx; idx--) {
+    log_entry = log_store_->entry_at(idx)
+    if (log_entry.type == app_log) {
+        state_machine_->rollback_ext(idx, log_entry.buffer)
+    } else if (log_entry.type == conf) {
+        state_machine_->rollback_config(idx, config)
+    }
+}
+```
+
+**What happens**:
+1. **For each log from 105 down to 100**:
+   - Read the log entry from logstore
+   - If app_log: Call `RaftStateMachine::rollback_ext(idx, buffer)`
+   - If config_log: Call `RaftStateMachine::rollback_config(idx, config)`
+
+2. **In RaftStateMachine::rollback_ext**:
+   - Notify application layer via listener: `m_listener->on_pre_commit_rollback(lsn, ...)`
+   - Application can undo any in-memory state changes
+   - Does NOT remove logs from logstore yet
+
+**Key Point**: This is a notification phase, allowing application to clean up state before logs are physically removed.
+
+---
+
+### Phase 2: Overwrite with New Logs (Forward Order)
+
+After rollback phase completes, NuRaft proceeds to overwrite.
+
+**Entry Point**: NuRaft calls `log_store_->write_at(index, entry)` for each new log
+
+**NuRaft's Logic**:
+```cpp
+// Overwrite logs in forward order
+while (log_idx < next_slot() && has_more_entries) {
+    entry = new_entries[count]
+    log_store_->write_at(log_idx, entry)
+
+    if (entry.type == app_log) {
+        state_machine_->pre_commit_ext(log_idx, entry.buffer)
+    }
+
+    log_idx++
+    count++
+}
+```
+
+**What happens**:
+1. **For each new log from 100 onwards**:
+   - Call `ReplLogStore::write_at(index, entry)`
+   - Call `RaftStateMachine::pre_commit_ext()` to notify application
+
+---
+
+### Step 1: Write At Specific Index
+
+**Entry Point**: `ReplLogStore::write_at`
+
+**What happens**:
+1. Check if entry is app_log (skip config entries)
+2. Create or retrieve repl_req from journal entry
+3. Link LSN to repl_req
+4. Delegate to `HomeRaftLogStore::write_at(index, entry)`
+
+**Key Point**: This is called for EACH log in the overwrite range (100, 101, 102, ...)
+
+---
+
+### Step 2: Rollback and Append Single Log
+
+**Entry Point**: `HomeRaftLogStore::write_at` (called for EACH index)
+
+**What happens**:
+
+1. **Serialize new log entry**:
+   - Format: `[term][type][data]`
+
+2. **Rollback logstore to before this index**:
+   - Call `m_log_store->rollback(to_store_lsn(index) - 1)`
+   - Example: For index=100, rollback to store_lsn=99
+   - This removes log at index 100 and all after it
+
+3. **Reset durable LSN**:
+   - Set `m_last_durable_lsn = -1`
+   - Will be recalculated on next flush
+
+4. **Append new log at this index**:
+   - Call `m_log_store->append_async(new_data)`
+   - Since we just rolled back to 99, next append is at 100
+
+5. **Validate LSN matches**:
+   - Assert `appended_lsn == index`
+
+6. **Clear cache entries after this index**:
+   - Remove cached entries with LSN > index
+   - Prevents serving stale data
+
+7. **Flush immediately**:
+   - Call `end_of_append_batch(index, 1)`
+   - Ensures new log is durable
+
+**Why rollback before each write_at?**
+- First write_at(100): Removes 100-105, writes new 100
+- Second write_at(101): Removes 101 onwards (already gone), writes new 101
+- Ensures each new log is written cleanly
+
+---
+
+### Step 3: Rollback In-Memory State
+
+**Entry Point**: `HomeLogStore::rollback`
+
+**What happens**:
+
+1. **Rollback next LSN**:
+   - Set `m_next_lsn = upto_lsn + 1`
+   - Example: rollback(99) sets m_next_lsn = 100
+
+2. **Truncate in-memory records**:
+   - Call `m_records.truncate(upto_lsn)`
+   - Removes all records with `seq_num > upto_lsn`
+
+3. **Update tail LSN** (if needed):
+   - Set `m_tail_lsn = upto_lsn`
+
+**Key Point - No Physical Truncation**:
+- Does NOT call `m_logdev->truncate()`
+- Old logs remain on disk but are logically invalid
+- Will be overwritten when new logs are appended
+- Physical space reclaimed during next compact
+
+**Why no physical truncation?**
+1. **Efficiency**: Rollback is rare, no need for expensive truncation
+2. **Simplicity**: Just overwrite in-place
+3. **Safety**: Old data remains until overwritten
+
+---
+
+### Complete Rollback Flow Example
+
+```
+Initial State (Old Leader):
+   RAFT LSN:     97  98  99  100 101 102 103 104 105
+   Committed:    ✓   ✓   ✓   ✗   ✗   ✗   ✗   ✗   ✗
+
+Leader Change → New Leader has different logs at 100-102
+
+Phase 1 - NuRaft Rollback (Reverse Order):
+   rollback_ext(105) → Notify app to undo state for 105
+   rollback_ext(104) → Notify app to undo state for 104
+   rollback_ext(103) → Notify app to undo state for 103
+   rollback_ext(102) → Notify app to undo state for 102
+   rollback_ext(101) → Notify app to undo state for 101
+   rollback_ext(100) → Notify app to undo state for 100
+
+Phase 2 - NuRaft Overwrite (Forward Order):
+   write_at(100, new_entry_100):
+      → rollback(99) removes 100-105 from logstore
+      → append new log 100
+      → flush
+
+   write_at(101, new_entry_101):
+      → rollback(100) removes 101 onwards (already gone)
+      → append new log 101
+      → flush
+
+   write_at(102, new_entry_102):
+      → rollback(101) removes 102 onwards (already gone)
+      → append new log 102
+      → flush
+
+Final State:
+   RAFT LSN:     97  98  99  100 101 102
+   Committed:    ✓   ✓   ✓   ✗   ✗   ✗   (will commit later)
+
+   Logs 100-102 now contain NEW data from new leader
+   Old logs 100-105 are orphaned on disk
+```
+
+---
+
+### Rollback vs Compact
+
+| Aspect | Rollback | Compact |
+|--------|----------|---------|
+| **When** | Leader change, uncommitted logs | After snapshot, committed logs |
+| **Scope** | From rollback point to end | From start to compact point |
+| **Physical** | No disk truncation | Physical disk truncation |
+| **Data** | Overwritten in-place | Reclaimed via truncate |
+| **Frequency** | Rare (only on failures) | Regular (after snapshots) |
+
+---
+
+## Recovery Flow
+
+Recovery rebuilds the in-memory state of RAFT logs from persistent storage after restart.
+
+### Overview
+
+```
+Restart → LogDev::do_load() → Read logs from disk →
+on_log_found callbacks → Rebuild logstore mappings →
+RAFT replay → Rebuild state machine
+```
+
+---
+
+### Step 1: Load LogDev Metadata
+
+**Entry Point**: `LogDev::start` (when format=false)
+
+**What happens**:
+1. Read logdev superblock from meta service:
+   ```cpp
+   struct logdev_superblk {
+       start_dev_offset: where to start reading logs
+       key_idx: log_idx to start from
+       num_stores: how many logstores on this logdev
+       ...
+   }
+   ```
+
+2. Read logstore superblocks for each store:
+   ```cpp
+   struct logstore_superblk {
+       m_first_seq_num: first valid LSN for this logstore
+   }
+   ```
+
+3. Initialize LogDev state:
+   - Set `m_log_idx = key_idx` (resume log index sequence)
+   - Prepare to read from `start_dev_offset`
+
+**Output**: LogDev knows where to start reading
+
+---
+
+### Step 2: Scan and Parse Logs
+
+**Entry Point**: `LogDev::do_load`
+
+**What happens**:
+
+1. **Create log stream reader**:
+   - Start reading from `start_dev_offset`
+   - Uses `log_stream_reader` to parse LogGroups
+
+2. **Read LogGroup by LogGroup**:
+   ```cpp
+   while not end of device:
+       LogGroup = read_next_log_group()
+       validate_header(LogGroup.header)
+       for each record in LogGroup:
+           dispatch to correct logstore
+   ```
+
+3. **Validate LogGroup**:
+   - Check magic number: `LOG_GROUP_HDR_MAGIC`
+   - Verify CRC matches
+   - Check prev_grp_crc links to last group
+
+4. **Extract log records**:
+   - Parse each `serialized_log_record` from LogGroup
+   - Structure:
+     ```cpp
+     serialized_log_record {
+         size: size of log data
+         offset: offset within LogGroup
+         is_inlined: whether data is inline or OOB
+         store_seq_num: logstore LSN
+         store_id: which logstore this belongs to
+     }
+     ```
+
+5. **Call logstore callback**:
+   - For each record: `on_logfound(store_id, seq_num, logdev_key, flush_ld_key, data, ...)`
+   - Passes to correct HomeLogStore based on store_id
+
+**Key Points**:
+- Reads sequentially from start_dev_offset to end
+- Stops at first invalid LogGroup (end of valid logs)
+- Handles inline vs OOB data within LogGroup
+
+**Output**: All log records dispatched to logstores
+
+---
+
+### Step 3: Rebuild HomeLogStore State
+
+**Entry Point**: `HomeLogStore::on_log_found`
+
+**What happens**:
+
+1. **Filter by start LSN**:
+   - If `seq_num < m_start_lsn`: skip this record
+   - `m_start_lsn` comes from logstore superblock (after truncation)
+
+2. **Update tail LSN**:
+   - If `seq_num > m_tail_lsn`: update `m_tail_lsn = seq_num`
+   - Track highest LSN seen
+
+3. **Determine truncation key**:
+   ```cpp
+   if seq_num > m_tail_lsn:
+       trunc_key = flush_ld_key  // This is new tail
+   else:
+       trunc_key = m_records.at(m_tail_lsn).m_trunc_key  // Inherit from current tail
+   ```
+
+4. **Create logstore record**:
+   - Build in-memory record:
+     ```cpp
+     logstore_record {
+         m_dev_key: logdev_key (where to read this log)
+         m_trunc_key: logdev_key (truncation boundary)
+     }
+     ```
+   - Insert into `m_records` at seq_num
+
+5. **Update next LSN**:
+   - Set `m_next_lsn = max(m_next_lsn, seq_num + 1)`
+   - Ensures next append doesn't conflict
+
+6. **Call user callback** (if registered):
+   - User-provided `log_found_cb(seq_num, data, context)`
+   - Allows upper layer to examine logs during recovery
+
+**Output**: HomeLogStore has complete `seq_num → logdev_key` mapping
+
+---
+
+
+### Recovery Example
+
+**Persistent State**:
+```
+LogDev Superblock:
+   start_dev_offset: 1024
+   key_idx: 50
+
+LogStore Superblock:
+   m_first_seq_num: 100 (after last truncation)
+
+Journal Device:
+   [Offset 1024]: LogGroup containing records 100-105
+   [Offset 2048]: LogGroup containing records 106-110
+   ...
+```
+
+**Recovery Steps**:
+1. Read from offset 1024
+2. Parse LogGroups
+3. Extract records 100-110 (skip <100)
+4. Rebuild mapping: 100→{idx:50, off:1024}, 101→{idx:51, off:1056}, ...
+5. RAFT replays logs 100-110
+6. Resume at LSN 111
+
+---
+
+## Summary
+
+### Append Flow
+**Purpose**: Persist new write operations to disk
+**Key Steps**:
+1. Leader writes data to data service
+2. Leader proposes to RAFT via `raft_server()->append_entries_ext()`
+3. RAFT replicates to all members via `append()`
+4. All members persist to LogStore (in-memory)
+5. LogDev batches records into LogGroup
+6. `end_of_append_batch` triggers flush to disk
+7. LogDev physically writes to journal device
+
+**Critical Path**: Data must be written before log flush (for followers)
+
+---
+
+### Compact/Truncate Flow
+**Purpose**: Reclaim space from old snapshotted logs
+**Key Steps**:
+1. RAFT creates snapshot at LSN X
+2. Compute `effective_compact_lsn = min(X, truncation_upper_limit)`
+3. Truncate LogStore in-memory (update start_lsn)
+4. LogDev finds minimum truncation point across all stores
+5. Physically truncate journal device
+6. Persist compact_lsn in checkpoint
+
+**Key Constraint**: `truncation_upper_limit` ensures no follower is left behind
+
+---
+
+### Rollback Flow
+**Purpose**: Resolve log conflicts during leader changes
+**Key Steps**:
+1. New leader sends `write_at(index, new_entry)`
+2. Rollback LogStore to index-1 (in-memory only)
+3. Append new log at same index
+4. Immediately flush to disk
+5. Old logs orphaned, reclaimed during next compact
+
+**Key Point**: No physical truncation, just in-memory rollback + overwrite
+
+---
+
+### Recovery Flow
+**Purpose**: Rebuild state after restart
+**Key Steps**:
+1. Read LogDev metadata (start_dev_offset, start_lsn)
+2. Scan LogGroups from start_dev_offset
+3. Dispatch records to logstores via callbacks
+4. Rebuild `seq_num → logdev_key` mapping
+5. RAFT replays logs from snapshot_lsn + 1
+6. Resume normal operation
+
+**Key Invariant**: `logstore_lsn = raft_lsn - 1`
+
+---
diff --git a/docs/structures/raft_repl_dev_log_dev.md b/docs/structures/raft_repl_dev_log_dev.md
new file mode 100644
index 000000000..321140b03
--- /dev/null
+++ b/docs/structures/raft_repl_dev_log_dev.md
@@ -0,0 +1,532 @@
+# RaftReplDev and Log Storage Architecture
+
+This document explains the full layered architecture of HomeStore's log storage subsystem, and how `RaftReplDev` uses it. It covers the relationship between **Log VDev**, **LogDev**, **LogStore**, and **RaftReplDev**, as well as how multiple Placement Groups (PGs) share chunks on the Log VDev.
+
+---
+
+## 1. Layered Architecture Overview
+
+```
+┌────────────────────────────────────────────────────────────────────┐
+│  RaftReplDev  (one per Raft replication group / PG)                │
+│   m_data_journal    : ReplLogStore                                 │
+│   m_free_blks_journal: HomeLogStore  (timeline consistency only)   │
+├────────────────────────────────────────────────────────────────────┤
+│  ReplLogStore : HomeRaftLogStore : nuraft::log_store               │
+│   Adapter between the NuRaft log_store interface and HomeStore     │
+├────────────────────────────────────────────────────────────────────┤
+│  HomeLogStore   (one logical LSN namespace, one logstore_id)       │
+│   Multiple HomeLogStores can share a single LogDev                 │
+├────────────────────────────────────────────────────────────────────┤
+│  LogDev   (one per PG, one logdev_id)                              │
+│   Group-commit engine, flush scheduler, multi-store manager        │
+├────────────────────────────────────────────────────────────────────┤
+│  JournalVirtualDev  (singleton, shared by ALL LogDevs)             │
+│  JournalVirtualDev::Descriptor  (one per LogDev, exclusive window) │
+│  Physical disk chunks  (dynamically allocated from global pool)    │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+### Key files
+
+| Layer | Files |
+|-------|-------|
+| Log VDev | `src/lib/device/journal_vdev.hpp`, `src/lib/device/journal_vdev.cpp` |
+| LogDev | `src/lib/logstore/log_dev.hpp`, `src/lib/logstore/log_dev.cpp` |
+| LogStoreService | `src/include/homestore/logstore_service.hpp`, `src/lib/logstore/log_store_service.cpp` |
+| HomeLogStore | `src/include/homestore/logstore/log_store.hpp`, `src/lib/logstore/log_store.cpp` |
+| HomeRaftLogStore | `src/lib/replication/log_store/home_raft_log_store.h`, `.cpp` |
+| ReplLogStore | `src/lib/replication/log_store/repl_log_store.h`, `.cpp` |
+| RaftReplDev | `src/lib/replication/repl_dev/raft_repl_dev.h`, `.cpp` |
+
+---
+
+## 2. Layer-by-Layer Description
+
+### 2.1 JournalVirtualDev (Log VDev)
+
+`JournalVirtualDev` (`src/lib/device/journal_vdev.hpp`) is the **raw storage backend** for all log data. There is exactly **one** `JournalVirtualDev` instance per HomeStore process, held by `LogStoreService::m_logdev_vdev` and shared by all LogDevs.
+
+It manages a **global chunk pool** (`m_chunk_pool`). Chunks are dynamically allocated from this pool on demand and returned to it when freed by truncation.
+
+Each LogDev gets its own private **Descriptor** — a sliding-window view into the VDev:
+
+```cpp
+// src/lib/device/journal_vdev.hpp
+struct JournalChunkPrivate {
+    logdev_id_t logdev_id{0};   // which LogDev owns this chunk
+    bool is_head{false};        // is it the head of the chain?
+    uint64_t created_at{0};     // creation timestamp (for crash recovery)
+    uint64_t end_of_chunk{0};   // where valid data ends in this chunk
+    chunk_num_t next_chunk{0};  // next chunk in this LogDev's chain
+};
+
+struct Descriptor {
+    logdev_id_t m_logdev_id;
+    off_t m_data_start_offset;              // left boundary (advances on truncate)
+    off_t m_end_offset;                     // right boundary (advances on new chunk)
+    std::vector<shared<Chunk>> m_journal_chunks; // ordered chunk chain
+    std::atomic<uint64_t> m_write_sz_in_total;
+    uint64_t m_reserved_sz;                 // space reserved but not yet written
+    uint64_t m_total_size;
+};
+```
+
+**All offsets are monotonically increasing — they never wrap around.**
+
+`tail_offset() = m_data_start_offset + m_write_sz_in_total + m_reserved_sz`
+
+A **critical invariant**: a single LogGroup write must be smaller than one chunk size. Writes are therefore guaranteed not to span chunk boundaries:
+```cpp
+// journal_vdev.cpp:259
+RELEASE_ASSERT_LT(sz, chunk_size, "Size requested greater than chunk size");
+```
+
+### 2.2 LogDev
+
+`LogDev` (`src/lib/logstore/log_dev.hpp`) is a logical "log band" with its own `logdev_id`. Each PG owns one exclusive LogDev.
+
+Key responsibilities:
+- **Group commit**: batches multiple `log_record`s into a `LogGroup` and writes them atomically. At most 2 `LogGroup` objects exist simultaneously (`max_log_group = 2`).
+- **Manages multiple HomeLogStores**: a LogDev can contain several stores, each with a distinct `logstore_id`.
+- **Flush scheduling**: supports three modes — `INLINE`, `TIMER`, and `EXPLICIT` (Raft always uses `EXPLICIT`).
+- **Owns a monotonic global log index** (`m_log_idx`) used to determine truncation safety.
+
+```cpp
+// log_dev.hpp (key members)
+std::atomic<logid_t> m_log_idx{0};            // ever-increasing log index
+std::shared_ptr<JournalVirtualDev> m_vdev;
+shared<JournalVirtualDev::Descriptor> m_vdev_jd; // exclusive descriptor
+std::unordered_map<logstore_id_t, logstore_info> m_id_logstore_map;
+LogGroup m_log_group_pool[max_log_group];      // pool of 2 log groups
+flush_mode_t m_flush_mode;
+```
+
+### 2.3 HomeLogStore
+
+`HomeLogStore` (`src/include/homestore/logstore/log_store.hpp`) is an independent logical LSN namespace. Multiple stores can coexist within one LogDev.
+
+Each store holds:
+- `m_store_id`: its unique ID within the LogDev
+- `m_logdev`: back-pointer to its LogDev
+- `m_records`: a `StreamTracker` mapping `logstore_seq_num → logdev_key{idx, dev_offset}`
+- `m_start_lsn`, `m_next_lsn`, `m_tail_lsn`
+
+Write path:
+```cpp
+// log_store.cpp:64
+auto ret = m_logdev->append_async(m_store_id, req->seq_num, req->data, ...);
+```
+
+Read path:
+```cpp
+// log_store.cpp:132-143
+const auto record = m_records.at(seq_num);
+const logdev_key ld_key = record.m_dev_key;
+const auto b = m_logdev->read(ld_key);
+```
+
+### 2.4 LogStoreService
+
+`LogStoreService` (`src/include/homestore/logstore_service.hpp`) is the global manager (singleton via `logstore_service()`). It holds the one `JournalVirtualDev` and a map of `logdev_id → LogDev`.
+
+---
+
+## 3. RaftReplDev and the Log Storage Stack
+
+### 3.1 Ownership Structure
+
+```
+LogStoreService (singleton)
+  ├── m_logdev_vdev : JournalVirtualDev         ← one global VDev
+  └── m_id_logdev_map : { logdev_id → LogDev }
+        └── LogDev  (one per PG)
+              ├── m_vdev_jd : Descriptor          ← exclusive VDev window
+              ├── m_id_logstore_map:
+              │     ├── HomeLogStore  (data_journal,  append_mode=true)
+              │     └── HomeLogStore  (free_blks_journal, append_mode=false, optional)
+              └── m_logdev_meta : superblock (persists logstore_id list)
+
+RaftReplDev  (one per Raft group)
+  ├── m_data_journal : ReplLogStore
+  │     └── HomeRaftLogStore
+  │           └── m_log_store : HomeLogStore      ← same object as above
+  └── m_free_blks_journal : HomeLogStore (optional, for timeline consistency)
+
+m_rd_sb persists: { logdev_id, logstore_id }     ← used for recovery
+```
+
+### 3.2 Creation (New PG)
+
+```cpp
+// raft_repl_dev.cpp:71-86  (new RaftReplDev, load_existing=false)
+m_data_journal = std::make_shared<ReplLogStore>(*this, *m_state_machine);
+m_rd_sb->logdev_id   = m_data_journal->logdev_id();
+m_rd_sb->logstore_id = m_data_journal->logstore_id();
+
+// If timeline_consistent:
+m_free_blks_journal = logstore_service().create_new_log_store(
+    m_rd_sb->logdev_id, false /* append_mode */);
+m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id();
+```
+
+Inside `HomeRaftLogStore` constructor:
+```cpp
+// home_raft_log_store.cpp:94-98
+// logstore_id == UINT32_MAX means brand-new
+m_logdev_id  = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT);
+m_log_store  = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */);
+m_logstore_id = m_log_store->get_store_id();
+```
+
+Each Raft group gets its **own exclusive LogDev** with `flush_mode = EXPLICIT`.
+
+### 3.3 Recovery (Existing PG)
+
+```cpp
+// raft_repl_dev.cpp:42-49
+m_data_journal = std::make_shared<ReplLogStore>(
+    *this, *m_state_machine,
+    m_rd_sb->logdev_id, m_rd_sb->logstore_id,
+    [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(...); },
+    [this](auto hs, auto lsn) { m_log_store_replay_done = true; ... });
+```
+
+Inside `HomeRaftLogStore` constructor:
+```cpp
+// home_raft_log_store.cpp:100-113
+m_logdev_id   = logdev_id;
+m_logstore_id = logstore_id;
+logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT);
+m_log_store_future = logstore_service()
+    .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb)
+    .thenValue([this](auto log_store) { m_log_store = std::move(log_store); });
+```
+
+### 3.4 LSN Offset Convention
+
+Raft LSNs start from 1; HomeLogStore `seq_num` starts from 0. They differ by exactly 1:
+
+```cpp
+// home_raft_log_store.cpp:40-44
+static constexpr logstore_seq_num_t to_store_lsn(uint64_t raft_lsn) {
+    return static_cast<logstore_seq_num_t>(raft_lsn - 1);
+}
+// home_raft_log_store.h:258
+static constexpr repl_lsn_t to_repl_lsn(store_lsn_t store_lsn) { return store_lsn + 1; }
+```
+
+### 3.5 Write Path (End to End)
+
+```
+NuRaft::append_entries()
+  → ReplLogStore::append()               [repl_log_store.cpp:9]
+      → m_sm.localize_journal_entry_finish()   // translate remote data pointers to local
+      → HomeRaftLogStore::append()        [home_raft_log_store.cpp:164]
+          → m_log_store->append_async()   // HomeLogStore (auto-assigns next LSN)
+              → LogDev::append_async()    // assigns logid, stores to StreamTracker
+
+  → ReplLogStore::end_of_append_batch()  [repl_log_store.cpp:41]
+      → m_rd.notify_after_data_written() // wait for block data to be persisted
+      → HomeRaftLogStore::end_of_append_batch()
+          → m_log_store->flush()
+              → LogDev::flush_under_guard()
+                  → LogDev::flush()
+                      → prepare_flush()   // pack pending log_records into LogGroup
+                      → m_vdev_jd->alloc_next_append_blk()  // may trigger append_chunk()
+                      → m_vdev_jd->sync_pwritev()            // write to physical disk
+                      → on_flush_completion()                 // notify HomeLogStore callbacks
+```
+
+---
+
+## 4. How 10 PGs Share 100 Chunks
+
+Assume: 1 `JournalVirtualDev` with 100 chunks (e.g., 256 MB each), 10 PGs.
+
+### 4.1 Chunk Allocation Principle
+
+**Chunks are NOT pre-partitioned across PGs.** All 100 chunks start in the global `ChunkPool`. Each LogDev starts with zero chunks and acquires them on demand.
+
+When a flush needs space and `tail_offset + write_size >= m_end_offset`:
+
+```cpp
+// journal_vdev.cpp:261-277  alloc_next_append_blk()
+if ((tail_offset() + sz) >= m_end_offset) {
+    append_chunk();   // dequeue one chunk from the global pool
+}
+```
+
+```cpp
+// journal_vdev.cpp:208-253  append_chunk()
+auto new_chunk = m_vdev.m_chunk_pool->dequeue();  // take from global pool
+m_total_size += new_chunk->size();
+m_end_offset += new_chunk->size();
+
+if (m_journal_chunks.empty()) {
+    // First chunk: mark as head, stamp logdev_id
+    new_chunk_private->is_head    = true;
+    new_chunk_private->logdev_id  = m_logdev_id;
+    new_chunk_private->end_of_chunk = chunk_size;
+} else {
+    // Subsequent chunk: link previous chunk's next_chunk pointer to this one
+    // and record the actual end_of_chunk in the previous chunk
+    last_chunk_private->next_chunk = new_chunk->chunk_id();
+    last_chunk_private->end_of_chunk = offset_in_chunk;   // watermark
+    m_vdev.update_chunk_private(last_chunk, ...);          // persist metadata
+}
+m_journal_chunks.push_back(new_chunk);
+m_vdev.update_chunk_private(new_chunk, ...);
+```
+
+The chunk's private metadata is **persisted to disk immediately** each time ownership changes, enabling crash-safe recovery.
+
+### 4.2 Chunk Exclusivity
+
+A chunk belongs to exactly one entity at any moment:
+
+| State | Owner |
+|-------|-------|
+| In `ChunkPool` | No LogDev — free for any |
+| In a `Descriptor::m_journal_chunks` | That LogDev exclusively |
+
+There is no sharing of chunks between LogDevs. The `logdev_id` field in `JournalChunkPrivate` records current ownership.
+
+### 4.3 Physical Layout on Disk
+
+Chunks from different LogDevs are **physically interleaved** on disk in no particular order. The logical ordering within a LogDev is maintained solely through the `next_chunk` linked-list pointers stored in each chunk's private metadata.
+
+Example snapshot (chunks from 10 PGs scattered on disk):
+
+```
+Physical disk:
+  chunk[0]  → ChunkPool (free)
+  chunk[1]  → PG_2 (head, logdev=2, next=55)
+  chunk[2]  → ChunkPool (free)
+  chunk[3]  → PG_0 (head, logdev=0, next=17)
+  chunk[4]  → ChunkPool (free)
+  ...
+  chunk[17] → PG_0 (logdev=0, next=42)
+  chunk[23] → PG_1 (head, logdev=1, next=0, currently writing)
+  chunk[42] → PG_0 (logdev=0, next=0, currently writing)
+  chunk[55] → PG_2 (logdev=2, next=78)
+  chunk[78] → PG_2 (logdev=2, next=0, currently writing)
+  ...
+  remaining → ChunkPool (free)
+```
+
+Each LogDev's virtual offset space is independent:
+
+```
+PG_0 Descriptor (virtual tape):
+  ├── [0         .. chunk_size)    chunk[3]  (head)
+  ├── [chunk_size .. 2*chunk_size) chunk[17]
+  └── [2*chunk_sz .. 3*chunk_size) chunk[42] ← m_end_offset
+
+  m_data_start_offset = 0          (no truncation yet)
+  tail_offset()       ≈ 2*chunk_sz + bytes_written_in_chunk[42]
+```
+
+### 4.4 Flush Principle
+
+Flush modes (set per LogDev):
+
+| Mode | Trigger | Used by |
+|------|---------|---------|
+| `EXPLICIT` | Caller invokes `flush_under_guard()` | Raft (`end_of_append_batch`) |
+| `INLINE` | After `append_async`, if `pending_size >= threshold` | Non-Raft stores |
+| `TIMER` | Periodic timer checks if there is pending unflushed data | Non-Raft stores |
+
+The `LogGroup` header carries a CRC and `prev_grp_crc` chain for integrity validation during recovery.
+
+**Write alignment constraint**: each `LogGroup` flush must fit within a single chunk:
+```cpp
+RELEASE_ASSERT_LT(sz, chunk_size, "Size requested greater than chunk size");
+```
+If the remaining space in the current chunk cannot fit the next `LogGroup`, a new chunk is obtained first, then the write proceeds.
+
+### 4.5 Truncate Principle
+
+Truncation flows through all layers:
+
+#### Step 1 — HomeLogStore layer (in-memory)
+```cpp
+// log_store.cpp:208-258
+bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_only) {
+    m_records.truncate(upto_lsn);      // release in-memory StreamTracker entries
+    m_start_lsn.store(upto_lsn + 1);
+    if (!in_memory_only) m_logdev->truncate();
+}
+```
+
+#### Step 2 — LogDev layer (find safe truncation point)
+```cpp
+// log_dev.cpp:631-665  LogDev::truncate()
+logdev_key min_safe_ld_key = logdev_key::out_of_bound_ld_key();
+for (auto& [store_id, store] : m_id_logstore_map) {
+    auto [trunc_lsn, trunc_ld_key, tail_lsn] = store.log_store->truncate_info();
+    if (trunc_ld_key.idx < min_safe_ld_key.idx) {
+        min_safe_ld_key = trunc_ld_key;   // take the most conservative point
+    }
+}
+// All stores (data_journal AND free_blks_journal) must agree before truncating.
+m_vdev_jd->truncate(min_safe_ld_key.dev_offset);
+m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, ...);
+```
+
+#### Step 3 — Descriptor layer (release chunks)
+```cpp
+// journal_vdev.cpp:569-687  Descriptor::truncate(truncate_offset)
+// 1. Find which chunk contains truncate_offset → this becomes the new head.
+// 2. Persist is_head=true on the new head chunk (crash-safe: two heads may
+//    exist transiently; recovery picks the one with higher created_at).
+// 3. Walk the chunk list from the old head:
+for (auto it = m_journal_chunks.begin(); ...) {
+    if (cover_offset <= truncate_offset) {
+        m_total_size -= chunk->size();
+        m_journal_chunks.erase(it);
+        m_vdev.release_chunk_to_pool(chunk);   // return to global pool
+    }
+}
+// 4. Update m_data_start_offset = truncate_offset.
+// 5. Reduce m_write_sz_in_total by size_to_truncate.
+```
+
+```cpp
+// journal_vdev.cpp:156-166  release_chunk_to_pool()
+*data = JournalChunkPrivate{};          // clear logdev_id and all metadata
+update_chunk_private(chunk, data);      // persist the clear (crash-safe)
+m_chunk_pool->enqueue(chunk);           // chunk is now available to any LogDev
+```
+
+**A chunk is released only when the truncation offset completely covers it.** The chunk containing the new `m_data_start_offset` is retained.
+
+### 4.6 Recovery: Reconstructing Chunk Chains
+
+At startup, `JournalVirtualDev::init()` reconstructs each LogDev's chunk chain from persisted metadata:
+
+```cpp
+// journal_vdev.cpp:72-138
+void JournalVirtualDev::init() {
+    // 1. Scan all chunks; build: logdev_id → head_chunk, chunk_id → chunk
+    for (auto& [_, chunk] : m_all_chunks) {
+        if (data->is_head) {
+            // Among multiple is_head chunks for the same logdev_id,
+            // pick the one with the highest created_at timestamp.
+            logdev_head_map[logdev_id] = { chunk_id, created_at };
+        }
+    }
+
+    // 2. For each logdev, follow next_chunk pointers to rebuild the ordered list.
+    for (auto& [logdev_id, head] : logdev_head_map) {
+        auto journal_desc = std::make_shared<Descriptor>(*this, logdev_id);
+        auto chunk_num = head.chunk_num;
+        while (chunk_num != 0) {
+            journal_desc->m_journal_chunks.push_back(chunk_map[chunk_num]);
+            chunk_num = data->next_chunk;
+        }
+    }
+
+    // 3. Chunks not in any chain are orphans — remove them.
+    for (auto& [_, chunk] : m_all_chunks) {
+        if (!visited_chunks.count(chunk->chunk_id())) {
+            remove_journal_chunks(orphan_chunks);
+        }
+    }
+}
+```
+
+The **crash-safe two-head mechanism**: when truncation persists `is_head=true` on the new head before releasing old chunks, a crash leaves two chunks with `is_head=true`. Recovery resolves this by selecting the one with the **higher `created_at`** timestamp, treating the other as orphaned.
+
+---
+
+## 5. Log VDev Capacity and Chunk Allocation Internals
+
+This section explains why `LogStoreService::create_vdev()` passes `vdev_size=0` and `num_chunks=0` for the Log VDev, how the system still knows the allocatable capacity, and where the actual allocation logic lives.
+
+### 5.1 Why `vdev_size=0` and `num_chunks=0` still works
+
+The Log VDev (`JournalVirtualDev`) is created with a *dynamic size* configuration (see `src/lib/logstore/log_store_service.cpp`, `LogStoreService::create_vdev()`): it does **not** pre-create a fixed number of chunks at format time.
+
+Instead, it relies on an internal **ChunkPool** which can *create* new chunks on demand. The real upper bound is determined by the **physical device data region** and by available chunk-id / chunk-info slots.
+
+### 5.2 Where chunks come from: `ChunkPool` producer thread
+
+`JournalVirtualDev` constructs a `ChunkPool` (`src/lib/device/journal_vdev.cpp:47-57`). The pool maintains a background producer thread which creates chunks via `DeviceManager::create_chunk()` when the pool drops below a threshold.
+
+Key code path:
+
+- `ChunkPool::producer()` (`src/lib/device/device_manager.cpp:873-899`)
+  - calls `DeviceManager::create_chunk(dev_type, vdev_id, chunk_size, private_data)`
+
+### 5.3 Global allocation constraints
+
+A new chunk can be created only if all of the following remain available:
+
+1. **Global chunk-id space** (DeviceManager-level)
+   - `DeviceManager::create_chunk()` reserves a new `chunk_id` from `m_chunk_id_bm`.
+   - If there are no remaining IDs, it throws: `"System has no room for additional chunk"`.
+
+2. **Per-PDev chunk-info slots** (PhysicalDev-level)
+   - `PhysicalDev::create_chunk()` allocates a free *slot* in `m_chunk_info_slots`.
+   - If there are no remaining slots, it throws: `"System has no room for additional chunk"`.
+
+3. **Per-PDev free space in the data region** (PhysicalDev-level)
+   - This is enforced by `PhysicalDev::find_next_chunk_area()`.
+
+### 5.4 The physical device data region: `data_start_offset()` and `data_end_offset()`
+
+Chunk data cannot be placed at arbitrary offsets on a physical device. It must be placed inside the device's **data region**, which excludes the HomeStore superblocks/metadata areas.
+
+- The data start offset is computed during formatting in `DeviceManager::populate_pdev_info()` (`src/lib/device/device_manager.cpp:708-713`):
+  - `pinfo.data_offset = round_up(first_block_offset + total_superblock_size, phys_page_size)`
+
+- PhysicalDev exposes:
+  - `data_start_offset() = m_pdev_info.data_offset` (`src/lib/device/physical_dev.hpp:226`)
+  - `data_end_offset()` is the end of usable device space, adjusted if the superblock is mirrored in the footer (`src/lib/device/physical_dev.hpp:227-229`).
+
+Because chunk data must not overlap the superblock areas, chunk allocation searches only inside `[data_start_offset, data_end_offset)`.
+
+### 5.5 How free space is tracked: `m_chunk_data_area`
+
+Each `PhysicalDev` maintains an in-memory interval set of allocated chunk data ranges:
+
+- `PhysicalDev::m_chunk_data_area` (`src/lib/device/physical_dev.hpp:138`)
+
+It is populated in two ways:
+
+1. **Recovery path**: as existing chunks are loaded from the superblock, their occupied ranges are inserted
+   - `PhysicalDev::load_chunks()` inserts `[chunk_start_offset, chunk_start_offset + chunk_size)` into `m_chunk_data_area` (`src/lib/device/physical_dev.cpp:400-403`).
+
+2. **Creation path**: when a new chunk is created, the newly selected range is inserted
+   - `PhysicalDev::populate_chunk_info()` inserts the chosen interval (`src/lib/device/physical_dev.cpp:448-453`).
+
+Chunk deletion removes the corresponding range:
+- `PhysicalDev::free_chunk_info()` erases the interval (`src/lib/device/physical_dev.cpp:464-467`).
+
+### 5.6 The actual placement algorithm: `find_next_chunk_area()`
+
+The physical placement decision is made by `PhysicalDev::find_next_chunk_area(size)` (`src/lib/device/physical_dev.cpp:474-485`). It performs a simple first-fit scan:
+
+- Start with the candidate interval `[data_start_offset, data_start_offset + size)`.
+- If it overlaps an existing allocated interval, move the candidate to start at `exist_ival.upper()`.
+- If the candidate's upper bound exceeds `data_end_offset`, allocation fails with `"Physical dev has no room for additional chunk"`.
+
+This means **chunks from different VDevs can be physically interleaved on the same physical device**, because all VDevs allocate from the same per-PDev data region and are only separated by chunk ownership metadata (`chunk_info.vdev_id`).
+
+---
+
+## 6. Summary of Key Properties
+
+| Property | Behavior |
+|----------|----------|
+| Chunk allocation | Demand-driven from global pool; no pre-partitioning |
+| Chunk exclusivity | A chunk belongs to exactly one LogDev at any time |
+| Physical layout | Chunks from different LogDevs are interleaved on disk |
+| Offset space | Per-LogDev, monotonically increasing, never wraps |
+| Write boundary | A single LogGroup flush must fit within one chunk |
+| Flush mode (Raft) | `EXPLICIT` — triggered by `end_of_append_batch` only |
+| Truncation safety | Governed by the minimum `trunc_ld_key` across all LogStores in a LogDev |
+| Chunk release | Only when truncation fully covers the chunk |
+| Recovery | Per-chunk metadata (`logdev_id`, `next_chunk`, `is_head`, `created_at`) |
+| Crash safety | New head persisted before old chunks released; two-head resolved by `created_at` |
diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp
index 3e4dda2a0..2c19910f7 100644
--- a/src/lib/device/journal_vdev.cpp
+++ b/src/lib/device/journal_vdev.cpp
@@ -566,7 +566,7 @@ void JournalVirtualDev::Descriptor::update_tail_offset(off_t tail) {
     LOGINFOMOD(journalvdev, "Updated tail offset arg 0x{} desc {} ", to_hex(tail), to_string());
 }
 
-off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset) {
+off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset, logid_t log_idx) {
     const off_t ds_off = data_start_offset();
     COUNTER_INCREMENT(m_vdev.m_metrics, vdev_truncate_count, 1);
     HS_PERIODIC_LOG(DEBUG, journalvdev, "truncating to logical offset: 0x{} desc {}", to_hex(truncate_offset),
@@ -605,6 +605,14 @@ off_t JournalVirtualDev::Descriptor::truncate(off_t truncate_offset) {
     auto* private_data = r_cast< JournalChunkPrivate* >(const_cast< uint8_t* >(new_head_chunk->user_private()));
     private_data->is_head = true;
     private_data->logdev_id = m_logdev_id;
+    if (log_idx != -1) {
+        // Write recovery hint so do_load can handle a crash between this truncation and
+        // the subsequent logdev meta persist. Only written when log_idx is known.
+        private_data->head_version = JournalChunkPrivate::JOURNAL_HEAD_VERSION;
+        private_data->head_magic = JournalChunkPrivate::JOURNAL_HEAD_MAGIC;
+        private_data->head_start_offset = truncate_offset;
+        private_data->head_start_idx = log_idx;
+    }
     m_vdev.update_chunk_private(new_head_chunk, private_data);
 
     // Find all chunks which needs to be removed from the start of m_journal_chunks.
@@ -807,6 +815,11 @@ nlohmann::json JournalVirtualDev::Descriptor::get_status(int log_level) const {
     return j;
 }
 
+const JournalChunkPrivate* JournalVirtualDev::Descriptor::head_chunk_private() const {
+    if (m_journal_chunks.empty()) { return nullptr; }
+    return r_cast< const JournalChunkPrivate* >(m_journal_chunks.front()->user_private());
+}
+
 std::string JournalVirtualDev::Descriptor::to_string() const {
     off_t tail =
         static_cast< off_t >(data_start_offset() + m_write_sz_in_total.load(std::memory_order_relaxed)) + m_reserved_sz;
diff --git a/src/lib/device/journal_vdev.hpp b/src/lib/device/journal_vdev.hpp
index 460db4012..345045b3a 100644
--- a/src/lib/device/journal_vdev.hpp
+++ b/src/lib/device/journal_vdev.hpp
@@ -35,11 +35,21 @@ using journal_id_t = uint64_t;
 // Each log device has a list of journal chunk data with next_chunk.
 // Journal vdev will arrange the chunks in order during recovery.
 struct JournalChunkPrivate {
+    // Fields below are the original on-disk layout (must stay at these offsets).
     logdev_id_t logdev_id{0};
     bool is_head{false};       // Is it the head element.
     uint64_t created_at{0};    // Creation timestamp
     uint64_t end_of_chunk{0};  // The offset indicates end of chunk.
     chunk_num_t next_chunk{0}; // Next chunk in the list.
+
+    // Head chunk recovery hint appended after the original fields.
+    // Only valid when is_head==true and head_magic==JOURNAL_HEAD_MAGIC.
+    static constexpr uint32_t JOURNAL_HEAD_VERSION{1u};
+    static constexpr uint32_t JOURNAL_HEAD_MAGIC{0xBEEFCAFE};
+    uint32_t head_version{0};
+    uint32_t head_magic{0};
+    off_t head_start_offset{0}; // logdev truncate offset persisted with head
+    logid_t head_start_idx{0};  // logdev truncate idx persisted with head
 };
 
 static_assert(sizeof(JournalChunkPrivate) <= chunk_info::user_private_size, "Journal private area bigger");
@@ -286,7 +296,7 @@ class JournalVirtualDev : public VirtualDev {
          *
          * @return : return the new data start offset after truncation.
          */
-        off_t truncate(off_t truncate_offset);
+        off_t truncate(off_t truncate_offset, logid_t log_idx = -1);
 
         /**
          * @brief : get the total size in journal
@@ -332,6 +342,8 @@ class JournalVirtualDev : public VirtualDev {
 
         logdev_id_t logdev_id() const { return m_logdev_id; }
 
+        const JournalChunkPrivate* head_chunk_private() const;
+
         std::string to_string() const;
 
     private:
diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp
index 60d269509..609902a00 100644
--- a/src/lib/logstore/log_dev.cpp
+++ b/src/lib/logstore/log_dev.cpp
@@ -233,6 +233,46 @@ void LogDev::do_load(off_t device_cursor) {
             break;
         }
 
+        if (loaded_from == -1 && header->start_idx() > m_log_idx) {
+            // Corner-case recovery: a SIGKILL landed after vdev chunk truncation (durable) but
+            // before logdev meta persist.  The chunk list reflects the new head; logdev meta
+            // still carries the old (dev_offset, log_idx).  We validate via the recovery hint
+            // written atomically into the head chunk during truncation and then advance m_log_idx
+            // to match the first log group the stream reader actually found.
+            THIS_LOGDEV_LOG(WARN,
+                            "LogDev {} corner-case recovery: header->start_idx()={} > m_log_idx={} "
+                            "(stale meta dev_offset={}); validating head chunk recovery hint",
+                            m_logdev_id, header->start_idx(), m_log_idx.load(), device_cursor);
+
+            auto const* hint = m_vdev_jd->head_chunk_private();
+            HS_REL_ASSERT(hint, "logdev={} corner-case recovery: head chunk hint is missing; possible data corruption",
+                          m_logdev_id);
+            HS_REL_ASSERT(hint->is_head && hint->head_magic == JournalChunkPrivate::JOURNAL_HEAD_MAGIC &&
+                              hint->head_version == JournalChunkPrivate::JOURNAL_HEAD_VERSION,
+                          "logdev={} corner-case recovery: start_idx={} > m_log_idx={} but head chunk hint is invalid "
+                          "(magic=0x{} version={} is_head={}); possible data corruption",
+                          m_logdev_id, header->start_idx(), m_log_idx.load(), to_hex(hint->head_magic),
+                          hint->head_version, hint->is_head);
+            HS_REL_ASSERT_GT(hint->head_start_idx, m_log_idx.load(),
+                             "logdev={} corner-case recovery: hint->head_start_idx={} must be > stale m_log_idx={}",
+                             m_logdev_id, hint->head_start_idx, m_log_idx.load());
+
+            THIS_LOGDEV_LOG(
+                WARN, "LogDev {} corner-case recovery: advancing m_log_idx {} -> {} data_start_offset {} -> {}",
+                m_logdev_id, m_log_idx.load(), hint->head_start_idx, device_cursor, hint->head_start_offset);
+
+            // Scope and known limitations of this recovery:
+            // - This fix is scoped to logstream positioning only. It does NOT replay the full set
+            //   of meta changes that the aborted LogDev::truncate() had in-flight, specifically:
+            //   logstore superblock updates (start LSN), unreserve-store, and rollback-info cleanup
+            //   (log_dev.cpp LogDev::truncate()). Those changes may not have been persisted.
+            m_log_idx.store(hint->head_start_idx, std::memory_order_release);
+            m_vdev_jd->update_data_start_offset(hint->head_start_offset);
+            device_cursor = hint->head_start_offset;
+            do_load(device_cursor);
+            return;
+        }
+
         THIS_LOGDEV_LOG(DEBUG, "Found log group header offset=0x{} header {}", to_hex(group_dev_offset), *header);
         HS_REL_ASSERT_EQ(header->start_idx(), m_log_idx.load(), "log indx is not the expected one");
         if (loaded_from == -1) { loaded_from = header->start_idx(); }
@@ -665,7 +705,21 @@ uint64_t LogDev::truncate() {
     uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx);
 
     // Truncate them in vdev
-    m_vdev_jd->truncate(min_safe_ld_key.dev_offset);
+    m_vdev_jd->truncate(min_safe_ld_key.dev_offset, min_safe_ld_key.idx);
+
+#ifdef _PRERELEASE
+    // Simulate a crash between vdev chunk truncation (durable, sync_write) and logdev meta
+    // persist, reproducing the inconsistency that do_load's defensive start_idx check handles.
+    if (hs()->crash_simulator().crash_if_flip_set("abort_logdev_truncate_before_meta_persist")) {
+        THIS_LOGDEV_LOG(INFO,
+                        "CRASH FLIP fired: vdev truncated to dev_offset={} log_idx={} but meta NOT persisted. "
+                        "Stale meta on disk: dev_offset={} log_idx={}",
+                        min_safe_ld_key.dev_offset, min_safe_ld_key.idx, m_logdev_meta.get_start_dev_offset(),
+                        m_logdev_meta.get_start_log_idx());
+        decr_pending_request_num();
+        return num_records_to_truncate;
+    }
+#endif
 
     // Update the start offset to be read upon restart
     m_last_truncate_idx = min_safe_ld_key.idx;
diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp
index 45ecee96f..c13c6141d 100644
--- a/src/tests/test_log_dev.cpp
+++ b/src/tests/test_log_dev.cpp
@@ -184,6 +184,29 @@ class LogDevTest : public ::testing::Test {
         }
     }
 
+    // Like insert_batch_sync but always allocates via malloc with exactly data_size payload bytes,
+    // giving deterministic log-group sizes needed for chunk-boundary geometry tests.
+    void insert_batch_fixed(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch,
+                            uint32_t data_size) {
+        std::vector< uint8_t* > bufs;
+        bufs.reserve(batch);
+        for (int64_t i = 0; i < batch; ++i) {
+            uint32_t const total = sizeof(test_log_data) + data_size;
+            auto* raw = static_cast< uint8_t* >(std::malloc(total));
+            auto* d = new (raw) test_log_data();
+            d->size = data_size;
+            const char c = static_cast< char >(((lsn + i) % 94) + 33);
+            std::memset(d->get_data(), c, data_size);
+            bufs.push_back(raw);
+            log_store->write_async(lsn + i, {uintptr_cast(raw), total, false}, nullptr, nullptr);
+        }
+        log_store->flush();
+        lsn += batch;
+        for (auto* raw : bufs) {
+            std::free(raw);
+        }
+    }
+
     void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch,
                            uint32_t fixed_size = 0) {
         auto last = cur_lsn + batch;
@@ -797,6 +820,117 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) {
     }
 }
 
+#ifdef _PRERELEASE
+TEST_F(LogDevTest, TruncateChunkMetaInconsistency) {
+    // Reproduces a crash on restart triggered by a SIGKILL arriving between:
+    //   (1) m_vdev_jd->truncate()  — persists new chunk head to disk via sync_write (durable)
+    //   (2) m_logdev_meta.persist() — persists logdev superblock (never reached)
+    //
+    // On restart the chunk list reflects the new head chunk (from (1)) but logdev meta still
+    // holds the old (dev_offset, log_idx) pair (from before (2)). offset_to_chunk() maps the
+    // stale dev_offset against the new chunk list into the new head chunk where higher-indexed
+    // log data resides, causing do_load() to fire HS_REL_ASSERT_EQ(start_idx, m_log_idx).
+    LOGINFO("Step 1: Create a single logdev and logstore");
+    auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT);
+    s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple();
+    auto log_store = logstore_service().create_new_log_store(logdev_id, false);
+    const auto store_id = log_store->get_store_id();
+
+    // Geometry (all sizes assume flush_size_multiple=4096, chunk_size=8MB=8,388,608B):
+    //
+    // Each 4096B payload record becomes 4100B total (sizeof(test_log_data)=4 + 4096).
+    // 4100 % 4096 = 4 != 0 and !data.is_aligned() => always inlined (no OOB zero-copy).
+    // A batch of 500 records produces 3 log groups:
+    //   LG1 (202rec): round_up(48 + 202x20 + 202x4100 + 24, 4096) = round_up(832,312, 4096) = 835,584B (=204x4096)
+    //   LG2 (202rec): 835,584B
+    //   LG3 ( 96rec): round_up(48 +  96x20 +  96x4100 + 24, 4096) = round_up(395,592, 4096) = 397,312B (= 97x4096)
+    //   Total per batch = 2,068,480B (=505x4096)
+    // We use insert_batch_fixed (always malloc, never iobuf_alloc) to guarantee exact sizes.
+    //
+    // Step 2: 5 batches = 2,500 records (logdev idx 0-2499)
+    //   Batches 1-4: 4 x 2,068,480 = 8,273,920B fills chunk_0 (gap=114,688B to chunk end).
+    //   Batch 5 overflows chunk_0 (114,688B gap fills remainder), continues in chunk_1:
+    //     LG1 at chunk_1[       0] (idx 2000-2201)
+    //     LG2 at chunk_1[ 835,584] (idx 2202-2403)
+    //     LG3 at chunk_1[1,671,168] (idx 2404-2499)
+    //   m_last_flush_ld_key = {2404, X1=10,059,776} (LG3 of batch 5 in chunk_1).
+    //
+    // Step 3 truncation persists X1=10,059,776, Y1=2404 to meta; chunk_0 released, chunk_1 kept.
+    //
+    // Step 4: 4 batches = 2,000 records (logdev idx 2500-4499)
+    //   Batches 1-3 continue in chunk_1 from [2,068,480] to [8,273,920]. Batch 4 crosses into
+    //   chunk_2 (114,688B gap fills remainder of chunk_1). In chunk_2 this batch writes:
+    //     LG1 at chunk_2[       0] (idx 4000-4201)
+    //     LG2 at chunk_2[ 835,584] (idx 4202-4403)
+    //     LG3 at chunk_2[1,671,168] (idx 4404-4499)
+    //   m_last_flush_ld_key = {4404, X2=18,448,384} (LG3 of batch 4 in chunk_2).
+    //
+    // Step 7: second truncation targets X2=18,448,384.  vdev persists chunk_2 as head (durable),
+    //   releases chunk_1, then flip fires before meta.persist().  On disk: chunk_2 is head but
+    //   meta still holds stale (X1, Y1).
+    //
+    // Recovery: update_data_start_offset(X1=10,059,776) with chunk_list=[chunk_2].
+    //   offset_to_chunk(X1=10,059,776):
+    //     chunk_aligned = round_down(10,059,776, 8MB) = 8,388,608
+    //     off_l = 10,059,776 - 8,388,608 = 1,671,168  =>  chunk_2[1,671,168]
+    //   That is exactly the start of LG3 of step-4 batch 4: magic valid, start_idx=4404.
+    //   m_log_idx from stale meta = Y1 = 2404.
+    //   HS_REL_ASSERT_EQ(4404, 2404) fires  =>  crash reproduced.
+    constexpr uint32_t entry_size = 4096;
+    constexpr int64_t batch_size = 500;
+    constexpr int num_batches_step2 = 5;
+
+    LOGINFO("Step 2: Write {} deterministic entries ({}x{}) to fill chunk 0 and land X1 inside chunk 1",
+            batch_size * num_batches_step2, num_batches_step2, batch_size);
+    logstore_seq_num_t cur_lsn = 0;
+    for (int i = 0; i < num_batches_step2; ++i) {
+        insert_batch_fixed(log_store, cur_lsn, batch_size, entry_size);
+    }
+
+    LOGINFO("Step 3: Normal chunk-crossing truncation (no flip) — chunk 0 released, logdev meta persisted");
+    truncate_validate(log_store);
+
+    // Step 4: 4 more batches continue in chunk_1 then cross into chunk_2. The last log group of
+    // step 4 (LG3 of batch 4) lands at chunk_2[1,671,168], matching what offset_to_chunk maps
+    // stale X1=10,059,776 to when only chunk_2 is present after the step-7 crash.
+    constexpr int num_batches_step4 = 4;
+    LOGINFO("Step 4: Write {} more deterministic entries ({}x{}) to fill chunk 1 and allocate chunk 2",
+            batch_size * num_batches_step4, num_batches_step4, batch_size);
+    for (int i = 0; i < num_batches_step4; ++i) {
+        insert_batch_fixed(log_store, cur_lsn, batch_size, entry_size);
+    }
+
+    LOGINFO("Step 5: Register restart callback to re-open logdev and logstore after crash recovery");
+    std::promise< bool > reopen_done;
+    m_helper.change_start_cb([&logdev_id, &store_id, &log_store, &reopen_done]() {
+        logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT);
+        logstore_service()
+            .open_log_store(logdev_id, store_id, false /* append_mode */)
+            .thenValue([&log_store, &reopen_done](auto store) {
+                log_store = store;
+                reopen_done.set_value(true);
+            });
+    });
+
+    LOGINFO("Step 6: Inject flip to simulate SIGKILL after vdev chunk truncation, before logdev meta persist");
+    m_helper.set_basic_flip("abort_logdev_truncate_before_meta_persist");
+
+    LOGINFO("Step 7: Trigger second chunk-crossing truncation to fire the flip");
+    // Pass false so truncate() calls m_logdev->truncate(), where the flip fires:
+    // m_vdev_jd->truncate() completes (sync_write, durable) but m_logdev_meta.persist() is
+    // bypassed. LogDev::stop() skips its healing truncate when is_crashed(), preserving
+    // the on-disk inconsistency for the restart to exercise.
+    log_store->truncate(log_store->get_contiguous_completed_seq_num(-1), false /* in_memory_truncate_only */);
+
+    LOGINFO("Step 8: Wait for crash recovery (restart_homestore running in background thread)");
+    m_helper.wait_for_crash_recovery();
+    reopen_done.get_future().get(); // wait for logstore replay to complete
+
+    LOGINFO("Step 9: Restart succeeded — no do_load assertion fired; log store is valid");
+    ASSERT_NE(log_store, nullptr) << "Log store must be valid after crash recovery";
+}
+#endif
+
 SISL_OPTION_GROUP(test_log_dev,
                   (num_logdevs, "", "num_logdevs", "number of log devs",
                    ::cxxopts::value< uint32_t >()->default_value("4"), "number"),