eBay · xiaoxichen · Jun 24, 2026 · Jun 24, 2026
diff --git a/conanfile.py b/conanfile.py
@@ -10,7 +10,7 @@
 
 class HomeObjectConan(ConanFile):
     name = "homeobject"
-    version = "4.1.19"
+    version = "4.1.20"
 
     homepage = "https://github.com/eBay/HomeObject"
     description = "Blob Store built on HomeStore"
@@ -51,7 +51,7 @@ def build_requirements(self):
 
     def requirements(self):
         self.requires("sisl/[^13.2]", transitive_headers=True)
-        self.requires("homestore/[^7.5.2]")
+        self.requires("homestore/[^7.5.10]")
 
     def validate(self):
         if self.info.settings.compiler.cppstd:

diff --git a/docs/adr/gc-priority-scheduling.md b/docs/adr/gc-priority-scheduling.md
@@ -0,0 +1,93 @@
+# GC Priority Scheduling with Dual Watermarks
+
+**Status**: Accepted  
+**Date**: 2026-06-17  
+
+---
+
+## Context
+
+The GC manager periodically scans all chunks per pdev and submits GC tasks for any chunk whose
+garbage ratio (defragmented blocks / total blocks) exceeds a single threshold
+(`gc_garbage_rate_threshold`, default 50%). Tasks are submitted in arbitrary iteration order until a
+per-pdev quota (`max_task_num = 2 × (reserved_chunks − egc_reserved)`) is reached.
+
+This design has two weaknesses:
+
+1. **No priority ordering.** A chunk at 51% garbage and a chunk at 95% garbage are treated
+   identically. When the quota is exhausted, high-garbage chunks may be skipped in favour of
+   lower-garbage ones that happen to appear first in iteration order.
+
+2. **Single threshold.** The threshold must be set conservatively high (50%) to avoid saturating
+   the quota with marginally-dirty chunks, leaving genuinely dirty chunks under-served.
+
+---
+
+## Decision
+
+### 1. Priority-based scheduling (top-K by garbage ratio, descending)
+
+Before submitting any tasks, `scan_chunks_for_gc` now selects the top-K most garbage-heavy chunks
+per pdev using a bounded min-heap (`std::priority_queue`) of capacity `K = max_task_num`. After
+collection, the heap is drained into a vector and reversed so submission proceeds in descending
+garbage-ratio order — the chunks with the most garbage are always scheduled first. This maximises
+reclaimed space per GC cycle and makes scheduling deterministic: given the same chunk state, a
+leader and its followers will produce an identical submission order.
+
+`K` is fixed at `max_task_num` (`2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc)`)
+rather than the dynamic `remaining_capacity` so we always retain enough candidates if capacity
+opens up later during the scan. Memory is bounded to O(K) regardless of how many chunks the pdev
+holds.
+
+### 2. Dual watermarks
+
+Two configurable thresholds replace the single threshold:
+
+| Config key | Default | Meaning |
+|---|---|---|
+| `gc_garbage_rate_threshold` | 50% | **High watermark.** Chunks above this threshold are *high-tier* and can consume the full task quota. |
+| `gc_garbage_rate_threshold_low` | 30% | **Low watermark.** Chunks between the two watermarks are *low-tier* and are capped at half the task quota (`max_task_num / 2`). |
+
+The submission loop enforces these invariants:
+
+```
+total_quota  = max_task_num          (hard cap, both tiers combined)
+low_quota    = max_task_num / 2      (cap for low-tier chunks only)
+
+for chunk in top_k_chunks iterated DESC by garbage_ratio:
+    if total_submitted >= total_quota  → stop
+    if chunk is low-tier AND low_tier_submitted >= low_quota → skip
+    submit; update counters
+```
+
+Because chunks are iterated in descending order, all high-tier chunks are submitted before any
+low-tier ones, so the low-tier cap never blocks a more-urgent chunk.
+
+Both thresholds are marked `hotswap` and can be adjusted at runtime without restart.
+
+---
+
+## Consequences
+
+### Leader–Follower GC Synchronisation
+
+A key benefit of this change is improved synchronisation between the leader and its followers.
+Under Raft replication, all replicas apply the same writes and deletions, so the garbage ratio of
+any given vchunk will converge to the same value across all replicas. With priority-based
+scheduling, the leader and followers will independently reach the same conclusion about which
+chunks to GC and in what order — they see the same dirty state and rank chunks identically.
+
+This synchronisation matters because emergent GC (eGC) is triggered when a replica runs out of
+free space and must GC under pressure. If regular GC runs consistently on the dirtiest chunks
+first, space is reclaimed more efficiently, and replicas are less likely to diverge in their free
+space inventory. The result is fewer eGC events and lower tail latency for normal I/O.
+
+### Operational impact
+
+- **Lower effective threshold.** Because dirty chunks are prioritised, the high watermark can be
+  reduced over time (e.g., from 50% to 40%) without increasing the number of tasks per cycle.
+- **Coarse throttle for marginally-dirty chunks.** The low-tier cap ensures that chunks just above
+  the low watermark do not crowd out more valuable work when the quota is tight.
+- **Backward compatibility.** The existing `gc_garbage_rate_threshold` field retains its default
+  value (50%) and its role; only its semantics are clarified as the *high* watermark. No on-disk
+  format changes are required.
diff --git a/src/lib/homestore_backend/CMakeLists.txt b/src/lib/homestore_backend/CMakeLists.txt
@@ -155,6 +155,8 @@ add_test(NAME FetchDataWithOriginatorGC
         COMMAND homestore_test_dynamic -csv error --executor immediate --config_path ./
         --override_config hs_backend_config.enable_gc=true
         --override_config hs_backend_config.gc_enable_read_verify=true
+        --override_config hs_backend_config.gc_garbage_rate_threshold=50
+        --override_config hs_backend_config.gc_garbage_rate_threshold_low=30
         --gtest_filter=HomeObjectFixture.FetchDataWithOriginatorGC)
 
 add_executable(homestore_test_gc)
@@ -163,5 +165,6 @@ target_link_libraries(homestore_test_gc PUBLIC homeobject_homestore ${COMMON_TES
 add_test(NAME HomestoreTestGC COMMAND homestore_test_gc -csv error --executor immediate --config_path ./
         --override_config hs_backend_config.enable_gc=true
         --override_config hs_backend_config.gc_enable_read_verify=true
-        --override_config hs_backend_config.gc_garbage_rate_threshold=0 
+        --override_config hs_backend_config.gc_garbage_rate_threshold=0
+        --override_config hs_backend_config.gc_garbage_rate_threshold_low=0
         --override_config hs_backend_config.gc_scan_interval_sec=5)
diff --git a/src/lib/homestore_backend/gc_manager.cpp b/src/lib/homestore_backend/gc_manager.cpp
@@ -1,3 +1,6 @@
+#include <algorithm>
+#include <queue>
+
 #include <homestore/btree/btree_req.hpp>
 #include <homestore/btree/btree_kv.hpp>
 
@@ -220,64 +223,153 @@ std::shared_ptr< GCManager::pdev_gc_actor > GCManager::get_pdev_gc_actor(uint32_
     return it->second;
 }
 
-bool GCManager::is_eligible_for_gc(chunk_id_t chunk_id) {
+uint32_t GCManager::get_chunk_gc_ratio(chunk_id_t chunk_id) {
     auto chunk = m_chunk_selector->get_extend_vchunk(chunk_id);
+
+    // Only AVAILABLE chunks are eligible: INUSE means an open shard owns it, GC means already being processed.
+    if (chunk->m_state != ChunkState::AVAILABLE) { return 0; }
+
     const auto defrag_blk_num = chunk->get_defrag_nblks();
-    if (!defrag_blk_num) { return false; }
+    if (!defrag_blk_num) { return 0; }
 
-    // 1 if the chunk state is inuse, it is occupied by a open shard, so it can not be selected and we don't need gc it.
-    // 2 if the chunk state is gc, it means this chunk is being gc, or this is a reserved chunk, so we don't need gc it.
-    if (chunk->m_state != ChunkState::AVAILABLE) {
-        LOGDEBUGMOD(gcmgr, "chunk_id={} state is {}, not eligible for gc", chunk_id, chunk->m_state)
-        return false;
-    }
+    // Chunks with no pg assignment are unowned and do not need GC.
+    if (!chunk->m_pg_id.has_value()) { return 0; }
+
+    // If the pg is currently destroyed or not yet alive (e.g. baseline resync), skip it;
+    // add_gc_task will enforce this again at submission time as a safety guard.
+    // FIXME: if we want avoiding GC on certain PG/CHUNK, we might added here.
+    if (!m_hs_home_object->is_pg_alive(chunk->m_pg_id.value())) { return 0; }
 
     const auto total_blk_num = chunk->get_total_blks();
-    const auto gc_garbage_rate_threshold = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold);
-    bool should_gc = 100 * defrag_blk_num > total_blk_num * gc_garbage_rate_threshold;
+    const uint32_t ratio_pct = static_cast< uint32_t >((100 * defrag_blk_num) / total_blk_num);
 
     LOGDEBUGMOD(gcmgr,
-                "gc scan chunk_id={}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, should_gc={}",
-                chunk_id, chunk->get_used_blks(), chunk->available_blks(), total_blk_num, defrag_blk_num, should_gc);
+                "gc scan chunk_id={}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, "
+                "garbage_ratio_pct={}",
+                chunk_id, chunk->get_used_blks(), chunk->available_blks(), total_blk_num, defrag_blk_num, ratio_pct);
 
-    return should_gc;
+    return ratio_pct;
 }
 
 void GCManager::scan_chunks_for_gc() {
     const auto reserved_chunk_num_per_pdev = HS_BACKEND_DYNAMIC_CONFIG(reserved_chunk_num_per_pdev);
     const auto reserved_chunk_num_per_pdev_for_egc = HS_BACKEND_DYNAMIC_CONFIG(reserved_chunk_num_per_pdev_for_egc);
+    const auto gc_thresh_high = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold);
+    auto gc_thresh_low = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold_low);
+
+    DEBUG_ASSERT(gc_thresh_low <= gc_thresh_high,
+               "gc_garbage_rate_threshold_low({}) must be less than or equal to gc_garbage_rate_threshold({})",
+               gc_thresh_low, gc_thresh_high);
+    if (gc_thresh_low > gc_thresh_high) {
+        LOGERRORMOD(gcmgr,
+                    "gc_garbage_rate_threshold_low={} exceeds gc_garbage_rate_threshold={}, "
+                    "auto-correcting low to {}",
+                    gc_thresh_low, gc_thresh_high, gc_thresh_high / 2);
+        gc_thresh_low = gc_thresh_high / 2;
+    }
 
     for (const auto& [pdev_id, chunks] : m_chunk_selector->get_pdev_chunks()) {
-        auto max_task_num = 2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc);
+        const uint32_t max_task_num = 2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc);
+
         auto it = m_pdev_gc_actors.find(pdev_id);
         RELEASE_ASSERT(it != m_pdev_gc_actors.end(), "can not find gc actor for pdev_id {} when scanning chunks for gc",
                        pdev_id);
         auto& actor = it->second;
 
+        // Compute remaining capacity against the true cross-scan quota.
+        // m_pending_normal_gc_task_count tracks all tasks currently queued or running in m_gc_executor,
+        // not just tasks submitted by this scan cycle. This prevents unbounded queue growth across scans.
+        const uint32_t already_pending = actor->get_pending_normal_task_count();
+        if (already_pending >= max_task_num) {
+            LOGINFOMOD(gcmgr,
+                       "pdev_id={} already has {}/{} pending normal gc tasks, skipping submission this scan cycle",
+                       pdev_id, already_pending, max_task_num);
+            continue;
+        }
+        const uint32_t remaining_capacity = max_task_num - already_pending;
+        // Low-tier chunks (below high watermark) may consume at most half the remaining capacity so
+        // that high-tier chunks always get priority when quota is tight.
+        const uint32_t low_tier_cap = remaining_capacity / 2;
+
+        // Collect at most max_task_num chunks with the highest garbage ratios via a bounded
+        // min-heap. K = max_task_num (fixed during the scan) rather than remaining_capacity
+        // (which may shrink/grow as tasks queue/complete) so we always have enough candidates
+        // ready if capacity opens up later. Submission is still gated by the dynamic
+        // remaining_capacity / low_tier_cap below. Chunks at or below gc_thresh_low are not
+        // worth scheduling and are dropped during collection.
+        struct ChunkGCInfo {
+            chunk_id_t chunk_id;
+            // integer percentage [0,100]; computed as (100*defrag_blks)/total_blks
+            uint32_t garbage_ratio_pct;
+        };
+        auto min_heap_cmp = [](const ChunkGCInfo& a, const ChunkGCInfo& b) {
+            return a.garbage_ratio_pct > b.garbage_ratio_pct;
+        };
+        std::priority_queue< ChunkGCInfo, std::vector< ChunkGCInfo >, decltype(min_heap_cmp) > top_k(min_heap_cmp);
         for (const auto& chunk_id : chunks) {
-            if (is_eligible_for_gc(chunk_id)) {
-                auto future = actor->add_gc_task(static_cast< uint8_t >(task_priority::normal), chunk_id);
-                if (future.isReady()) {
-                    if (future.value()) {
-                        LOGINFOMOD(
-                            gcmgr,
-                            "gc task for chunk_id={} on pdev_id={} has been submitted and successfully completed "
-                            "shortly",
-                            chunk_id, pdev_id);
-                    } else {
-                        LOGWARNMOD(gcmgr,
-                                   "got false after add_gc_task for chunk_id={} on pdev_id={}, it means we cannot mark "
-                                   "this chunk to gc state(there is an open shard on this chunk ATM) or this task is "
-                                   "executed shortly but fails(fail to copy data or update gc index table) ",
-                                   chunk_id, pdev_id);
-                    }
-                } else if (0 == --max_task_num) {
-                    LOGINFOMOD(gcmgr, "reached max gc task limit for pdev_id={}, stopping further gc task submissions",
-                               pdev_id);
-                    break;
+            const uint32_t ratio_pct = get_chunk_gc_ratio(chunk_id);
+            if (ratio_pct <= gc_thresh_low) { continue; }
+            if (top_k.size() < max_task_num) {
+                top_k.push({chunk_id, ratio_pct});
+            } else if (ratio_pct > top_k.top().garbage_ratio_pct) {
+                top_k.pop();
+                top_k.push({chunk_id, ratio_pct});
+            }
+        }
+
+        // Drain the min-heap into a presized vector, writing back-to-front: the heap pops in
+        // ascending ratio order, so placing each popped element at the current trailing index
+        // yields a descending-by-ratio sequence directly — no separate reverse pass needed.
+        std::vector< ChunkGCInfo > eligible(top_k.size());
+        for (size_t i = eligible.size(); i > 0; --i) {
+            eligible[i - 1] = top_k.top();
+            top_k.pop();
+        }
+
+        // Submit GC tasks respecting a two-tier quota within the remaining capacity:
+        //   - high-tier (ratio > gc_thresh_high): can consume the full remaining_capacity
+        //   - low-tier  (gc_thresh_low < ratio <= gc_thresh_high): capped at remaining_capacity/2
+        // The descending traversal guarantees all high-tier chunks are submitted before low-tier ones.
+        uint32_t newly_submitted = 0;
+        uint32_t low_tier_submitted = 0;
+        for (const auto& info : eligible) {
+            if (newly_submitted >= remaining_capacity) { break; }
+
+            const bool is_high_tier = (info.garbage_ratio_pct > gc_thresh_high);
+            if (!is_high_tier && low_tier_submitted >= low_tier_cap) { continue; }
+
+            auto future = actor->add_gc_task(static_cast< uint8_t >(task_priority::normal), info.chunk_id);
+            if (future.isReady()) {
+                if (future.value()) {
+                    LOGINFOMOD(gcmgr,
+                               "gc task for chunk_id={} on pdev_id={} has been submitted and successfully completed "
+                               "shortly",
+                               info.chunk_id, pdev_id);
+                } else {
+                    LOGWARNMOD(gcmgr,
+                               "got false after add_gc_task for chunk_id={} on pdev_id={}, it means we cannot mark "
+                               "this chunk to gc state(there is an open shard on this chunk ATM) or this task is "
+                               "executed shortly but fails(fail to copy data or update gc index table) ",
+                               info.chunk_id, pdev_id);
                 }
+            } else {
+                ++newly_submitted;
+                if (!is_high_tier) { ++low_tier_submitted; }
+                LOGINFOMOD(gcmgr,
+                           "submitted gc task for chunk_id={} on pdev_id={}, garbage_ratio_pct={}, tier={}, "
+                           "newly_submitted={}/{}, low_tier_submitted={}/{}, total_pending={}",
+                           info.chunk_id, pdev_id, info.garbage_ratio_pct, is_high_tier ? "high" : "low",
+                           newly_submitted, remaining_capacity, low_tier_submitted, low_tier_cap,
+                           already_pending + newly_submitted);
             }
         }
+        if (newly_submitted > 0) {
+            LOGINFOMOD(gcmgr,
+                       "pdev_id={} scan complete: submitted {} new gc tasks (total pending now ~{}), "
+                       "low_tier={}, remaining_capacity was {}",
+                       pdev_id, newly_submitted, already_pending + newly_submitted, low_tier_submitted,
+                       remaining_capacity);
+        }
     }
 }
 
@@ -384,6 +476,9 @@ folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task(uint8_t priority
                 process_gc_task(move_from_chunk, priority, std::move(promise), gc_task_id);
             });
         } else {
+            // Increment BEFORE handing the task to the executor so that an immediately-completing
+            // task (which decrements via ~gc_task_guard) cannot underflow the counter.
+            m_pending_normal_gc_task_count.fetch_add(1, std::memory_order_relaxed);
             m_gc_executor->add([this, gc_task_id, priority, move_from_chunk, promise = std::move(promise)]() mutable {
                 LOGDEBUGMOD(gcmgr, "start gc task : move_from_chunk_id={}, priority={}", move_from_chunk, priority);
                 process_gc_task(move_from_chunk, priority, std::move(promise), gc_task_id);
@@ -1261,6 +1356,11 @@ void GCManager::pdev_gc_actor::process_gc_task(chunk_id_t move_from_chunk, uint8
     if (vchunk->m_state != ChunkState::GC) {
         GCLOGW(task_id, pg_id, NO_SHARD_ID, "move_from_chunk={} is expected to in GC state but not!", move_from_chunk);
         task.setValue(false);
+        // We return before constructing gc_task_guard, so we must mirror its bookkeeping here:
+        // decrement the per-pdev pending normal-priority counter that add_gc_task incremented.
+        if (priority == static_cast< uint8_t >(task_priority::normal)) {
+            m_pending_normal_gc_task_count.fetch_sub(1, std::memory_order_relaxed);
+        }
         m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id);
         return;
     }
@@ -1426,6 +1526,9 @@ GCManager::pdev_gc_actor::~pdev_gc_actor() {
 GCManager::pdev_gc_actor::gc_task_guard::~gc_task_guard() {
     m_gc_actor->on_gc_task_completed(priority, pg_id, move_from_chunk, move_to_chunk, vchunk_id, success, task_id);
     task.setValue(success);
+    if (priority == static_cast< uint8_t >(task_priority::normal)) {
+        m_gc_actor->m_pending_normal_gc_task_count.fetch_sub(1, std::memory_order_relaxed);
+    }
     m_gc_actor->m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id);
 }