diff --git a/conanfile.py b/conanfile.py index 009c6b54..2e4f6992 100644 --- a/conanfile.py +++ b/conanfile.py @@ -10,7 +10,7 @@ class HomeObjectConan(ConanFile): name = "homeobject" - version = "4.1.19" + version = "4.1.20" homepage = "https://github.com/eBay/HomeObject" description = "Blob Store built on HomeStore" @@ -51,7 +51,7 @@ def build_requirements(self): def requirements(self): self.requires("sisl/[^13.2]", transitive_headers=True) - self.requires("homestore/[^7.5.2]") + self.requires("homestore/[^7.5.10]") def validate(self): if self.info.settings.compiler.cppstd: diff --git a/docs/adr/gc-priority-scheduling.md b/docs/adr/gc-priority-scheduling.md new file mode 100644 index 00000000..02e01ca7 --- /dev/null +++ b/docs/adr/gc-priority-scheduling.md @@ -0,0 +1,93 @@ +# GC Priority Scheduling with Dual Watermarks + +**Status**: Accepted +**Date**: 2026-06-17 + +--- + +## Context + +The GC manager periodically scans all chunks per pdev and submits GC tasks for any chunk whose +garbage ratio (defragmented blocks / total blocks) exceeds a single threshold +(`gc_garbage_rate_threshold`, default 50%). Tasks are submitted in arbitrary iteration order until a +per-pdev quota (`max_task_num = 2 × (reserved_chunks − egc_reserved)`) is reached. + +This design has two weaknesses: + +1. **No priority ordering.** A chunk at 51% garbage and a chunk at 95% garbage are treated + identically. When the quota is exhausted, high-garbage chunks may be skipped in favour of + lower-garbage ones that happen to appear first in iteration order. + +2. **Single threshold.** The threshold must be set conservatively high (50%) to avoid saturating + the quota with marginally-dirty chunks, leaving genuinely dirty chunks under-served. + +--- + +## Decision + +### 1. Priority-based scheduling (top-K by garbage ratio, descending) + +Before submitting any tasks, `scan_chunks_for_gc` now selects the top-K most garbage-heavy chunks +per pdev using a bounded min-heap (`std::priority_queue`) of capacity `K = max_task_num`. After +collection, the heap is drained into a vector and reversed so submission proceeds in descending +garbage-ratio order — the chunks with the most garbage are always scheduled first. This maximises +reclaimed space per GC cycle and makes scheduling deterministic: given the same chunk state, a +leader and its followers will produce an identical submission order. + +`K` is fixed at `max_task_num` (`2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc)`) +rather than the dynamic `remaining_capacity` so we always retain enough candidates if capacity +opens up later during the scan. Memory is bounded to O(K) regardless of how many chunks the pdev +holds. + +### 2. Dual watermarks + +Two configurable thresholds replace the single threshold: + +| Config key | Default | Meaning | +|---|---|---| +| `gc_garbage_rate_threshold` | 50% | **High watermark.** Chunks above this threshold are *high-tier* and can consume the full task quota. | +| `gc_garbage_rate_threshold_low` | 30% | **Low watermark.** Chunks between the two watermarks are *low-tier* and are capped at half the task quota (`max_task_num / 2`). | + +The submission loop enforces these invariants: + +``` +total_quota = max_task_num (hard cap, both tiers combined) +low_quota = max_task_num / 2 (cap for low-tier chunks only) + +for chunk in top_k_chunks iterated DESC by garbage_ratio: + if total_submitted >= total_quota → stop + if chunk is low-tier AND low_tier_submitted >= low_quota → skip + submit; update counters +``` + +Because chunks are iterated in descending order, all high-tier chunks are submitted before any +low-tier ones, so the low-tier cap never blocks a more-urgent chunk. + +Both thresholds are marked `hotswap` and can be adjusted at runtime without restart. + +--- + +## Consequences + +### Leader–Follower GC Synchronisation + +A key benefit of this change is improved synchronisation between the leader and its followers. +Under Raft replication, all replicas apply the same writes and deletions, so the garbage ratio of +any given vchunk will converge to the same value across all replicas. With priority-based +scheduling, the leader and followers will independently reach the same conclusion about which +chunks to GC and in what order — they see the same dirty state and rank chunks identically. + +This synchronisation matters because emergent GC (eGC) is triggered when a replica runs out of +free space and must GC under pressure. If regular GC runs consistently on the dirtiest chunks +first, space is reclaimed more efficiently, and replicas are less likely to diverge in their free +space inventory. The result is fewer eGC events and lower tail latency for normal I/O. + +### Operational impact + +- **Lower effective threshold.** Because dirty chunks are prioritised, the high watermark can be + reduced over time (e.g., from 50% to 40%) without increasing the number of tasks per cycle. +- **Coarse throttle for marginally-dirty chunks.** The low-tier cap ensures that chunks just above + the low watermark do not crowd out more valuable work when the quota is tight. +- **Backward compatibility.** The existing `gc_garbage_rate_threshold` field retains its default + value (50%) and its role; only its semantics are clarified as the *high* watermark. No on-disk + format changes are required. diff --git a/src/lib/homestore_backend/CMakeLists.txt b/src/lib/homestore_backend/CMakeLists.txt index 7fd3d6fe..fd551605 100644 --- a/src/lib/homestore_backend/CMakeLists.txt +++ b/src/lib/homestore_backend/CMakeLists.txt @@ -155,6 +155,8 @@ add_test(NAME FetchDataWithOriginatorGC COMMAND homestore_test_dynamic -csv error --executor immediate --config_path ./ --override_config hs_backend_config.enable_gc=true --override_config hs_backend_config.gc_enable_read_verify=true + --override_config hs_backend_config.gc_garbage_rate_threshold=50 + --override_config hs_backend_config.gc_garbage_rate_threshold_low=30 --gtest_filter=HomeObjectFixture.FetchDataWithOriginatorGC) add_executable(homestore_test_gc) @@ -163,5 +165,6 @@ target_link_libraries(homestore_test_gc PUBLIC homeobject_homestore ${COMMON_TES add_test(NAME HomestoreTestGC COMMAND homestore_test_gc -csv error --executor immediate --config_path ./ --override_config hs_backend_config.enable_gc=true --override_config hs_backend_config.gc_enable_read_verify=true - --override_config hs_backend_config.gc_garbage_rate_threshold=0 + --override_config hs_backend_config.gc_garbage_rate_threshold=0 + --override_config hs_backend_config.gc_garbage_rate_threshold_low=0 --override_config hs_backend_config.gc_scan_interval_sec=5) diff --git a/src/lib/homestore_backend/gc_manager.cpp b/src/lib/homestore_backend/gc_manager.cpp index c93eb43d..91233fc7 100644 --- a/src/lib/homestore_backend/gc_manager.cpp +++ b/src/lib/homestore_backend/gc_manager.cpp @@ -1,3 +1,6 @@ +#include +#include + #include #include @@ -220,64 +223,153 @@ std::shared_ptr< GCManager::pdev_gc_actor > GCManager::get_pdev_gc_actor(uint32_ return it->second; } -bool GCManager::is_eligible_for_gc(chunk_id_t chunk_id) { +uint32_t GCManager::get_chunk_gc_ratio(chunk_id_t chunk_id) { auto chunk = m_chunk_selector->get_extend_vchunk(chunk_id); + + // Only AVAILABLE chunks are eligible: INUSE means an open shard owns it, GC means already being processed. + if (chunk->m_state != ChunkState::AVAILABLE) { return 0; } + const auto defrag_blk_num = chunk->get_defrag_nblks(); - if (!defrag_blk_num) { return false; } + if (!defrag_blk_num) { return 0; } - // 1 if the chunk state is inuse, it is occupied by a open shard, so it can not be selected and we don't need gc it. - // 2 if the chunk state is gc, it means this chunk is being gc, or this is a reserved chunk, so we don't need gc it. - if (chunk->m_state != ChunkState::AVAILABLE) { - LOGDEBUGMOD(gcmgr, "chunk_id={} state is {}, not eligible for gc", chunk_id, chunk->m_state) - return false; - } + // Chunks with no pg assignment are unowned and do not need GC. + if (!chunk->m_pg_id.has_value()) { return 0; } + + // If the pg is currently destroyed or not yet alive (e.g. baseline resync), skip it; + // add_gc_task will enforce this again at submission time as a safety guard. + // FIXME: if we want avoiding GC on certain PG/CHUNK, we might added here. + if (!m_hs_home_object->is_pg_alive(chunk->m_pg_id.value())) { return 0; } const auto total_blk_num = chunk->get_total_blks(); - const auto gc_garbage_rate_threshold = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold); - bool should_gc = 100 * defrag_blk_num > total_blk_num * gc_garbage_rate_threshold; + const uint32_t ratio_pct = static_cast< uint32_t >((100 * defrag_blk_num) / total_blk_num); LOGDEBUGMOD(gcmgr, - "gc scan chunk_id={}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, should_gc={}", - chunk_id, chunk->get_used_blks(), chunk->available_blks(), total_blk_num, defrag_blk_num, should_gc); + "gc scan chunk_id={}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, " + "garbage_ratio_pct={}", + chunk_id, chunk->get_used_blks(), chunk->available_blks(), total_blk_num, defrag_blk_num, ratio_pct); - return should_gc; + return ratio_pct; } void GCManager::scan_chunks_for_gc() { const auto reserved_chunk_num_per_pdev = HS_BACKEND_DYNAMIC_CONFIG(reserved_chunk_num_per_pdev); const auto reserved_chunk_num_per_pdev_for_egc = HS_BACKEND_DYNAMIC_CONFIG(reserved_chunk_num_per_pdev_for_egc); + const auto gc_thresh_high = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold); + auto gc_thresh_low = HS_BACKEND_DYNAMIC_CONFIG(gc_garbage_rate_threshold_low); + + DEBUG_ASSERT(gc_thresh_low <= gc_thresh_high, + "gc_garbage_rate_threshold_low({}) must be less than or equal to gc_garbage_rate_threshold({})", + gc_thresh_low, gc_thresh_high); + if (gc_thresh_low > gc_thresh_high) { + LOGERRORMOD(gcmgr, + "gc_garbage_rate_threshold_low={} exceeds gc_garbage_rate_threshold={}, " + "auto-correcting low to {}", + gc_thresh_low, gc_thresh_high, gc_thresh_high / 2); + gc_thresh_low = gc_thresh_high / 2; + } for (const auto& [pdev_id, chunks] : m_chunk_selector->get_pdev_chunks()) { - auto max_task_num = 2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc); + const uint32_t max_task_num = 2 * (reserved_chunk_num_per_pdev - reserved_chunk_num_per_pdev_for_egc); + auto it = m_pdev_gc_actors.find(pdev_id); RELEASE_ASSERT(it != m_pdev_gc_actors.end(), "can not find gc actor for pdev_id {} when scanning chunks for gc", pdev_id); auto& actor = it->second; + // Compute remaining capacity against the true cross-scan quota. + // m_pending_normal_gc_task_count tracks all tasks currently queued or running in m_gc_executor, + // not just tasks submitted by this scan cycle. This prevents unbounded queue growth across scans. + const uint32_t already_pending = actor->get_pending_normal_task_count(); + if (already_pending >= max_task_num) { + LOGINFOMOD(gcmgr, + "pdev_id={} already has {}/{} pending normal gc tasks, skipping submission this scan cycle", + pdev_id, already_pending, max_task_num); + continue; + } + const uint32_t remaining_capacity = max_task_num - already_pending; + // Low-tier chunks (below high watermark) may consume at most half the remaining capacity so + // that high-tier chunks always get priority when quota is tight. + const uint32_t low_tier_cap = remaining_capacity / 2; + + // Collect at most max_task_num chunks with the highest garbage ratios via a bounded + // min-heap. K = max_task_num (fixed during the scan) rather than remaining_capacity + // (which may shrink/grow as tasks queue/complete) so we always have enough candidates + // ready if capacity opens up later. Submission is still gated by the dynamic + // remaining_capacity / low_tier_cap below. Chunks at or below gc_thresh_low are not + // worth scheduling and are dropped during collection. + struct ChunkGCInfo { + chunk_id_t chunk_id; + // integer percentage [0,100]; computed as (100*defrag_blks)/total_blks + uint32_t garbage_ratio_pct; + }; + auto min_heap_cmp = [](const ChunkGCInfo& a, const ChunkGCInfo& b) { + return a.garbage_ratio_pct > b.garbage_ratio_pct; + }; + std::priority_queue< ChunkGCInfo, std::vector< ChunkGCInfo >, decltype(min_heap_cmp) > top_k(min_heap_cmp); for (const auto& chunk_id : chunks) { - if (is_eligible_for_gc(chunk_id)) { - auto future = actor->add_gc_task(static_cast< uint8_t >(task_priority::normal), chunk_id); - if (future.isReady()) { - if (future.value()) { - LOGINFOMOD( - gcmgr, - "gc task for chunk_id={} on pdev_id={} has been submitted and successfully completed " - "shortly", - chunk_id, pdev_id); - } else { - LOGWARNMOD(gcmgr, - "got false after add_gc_task for chunk_id={} on pdev_id={}, it means we cannot mark " - "this chunk to gc state(there is an open shard on this chunk ATM) or this task is " - "executed shortly but fails(fail to copy data or update gc index table) ", - chunk_id, pdev_id); - } - } else if (0 == --max_task_num) { - LOGINFOMOD(gcmgr, "reached max gc task limit for pdev_id={}, stopping further gc task submissions", - pdev_id); - break; + const uint32_t ratio_pct = get_chunk_gc_ratio(chunk_id); + if (ratio_pct <= gc_thresh_low) { continue; } + if (top_k.size() < max_task_num) { + top_k.push({chunk_id, ratio_pct}); + } else if (ratio_pct > top_k.top().garbage_ratio_pct) { + top_k.pop(); + top_k.push({chunk_id, ratio_pct}); + } + } + + // Drain the min-heap into a presized vector, writing back-to-front: the heap pops in + // ascending ratio order, so placing each popped element at the current trailing index + // yields a descending-by-ratio sequence directly — no separate reverse pass needed. + std::vector< ChunkGCInfo > eligible(top_k.size()); + for (size_t i = eligible.size(); i > 0; --i) { + eligible[i - 1] = top_k.top(); + top_k.pop(); + } + + // Submit GC tasks respecting a two-tier quota within the remaining capacity: + // - high-tier (ratio > gc_thresh_high): can consume the full remaining_capacity + // - low-tier (gc_thresh_low < ratio <= gc_thresh_high): capped at remaining_capacity/2 + // The descending traversal guarantees all high-tier chunks are submitted before low-tier ones. + uint32_t newly_submitted = 0; + uint32_t low_tier_submitted = 0; + for (const auto& info : eligible) { + if (newly_submitted >= remaining_capacity) { break; } + + const bool is_high_tier = (info.garbage_ratio_pct > gc_thresh_high); + if (!is_high_tier && low_tier_submitted >= low_tier_cap) { continue; } + + auto future = actor->add_gc_task(static_cast< uint8_t >(task_priority::normal), info.chunk_id); + if (future.isReady()) { + if (future.value()) { + LOGINFOMOD(gcmgr, + "gc task for chunk_id={} on pdev_id={} has been submitted and successfully completed " + "shortly", + info.chunk_id, pdev_id); + } else { + LOGWARNMOD(gcmgr, + "got false after add_gc_task for chunk_id={} on pdev_id={}, it means we cannot mark " + "this chunk to gc state(there is an open shard on this chunk ATM) or this task is " + "executed shortly but fails(fail to copy data or update gc index table) ", + info.chunk_id, pdev_id); } + } else { + ++newly_submitted; + if (!is_high_tier) { ++low_tier_submitted; } + LOGINFOMOD(gcmgr, + "submitted gc task for chunk_id={} on pdev_id={}, garbage_ratio_pct={}, tier={}, " + "newly_submitted={}/{}, low_tier_submitted={}/{}, total_pending={}", + info.chunk_id, pdev_id, info.garbage_ratio_pct, is_high_tier ? "high" : "low", + newly_submitted, remaining_capacity, low_tier_submitted, low_tier_cap, + already_pending + newly_submitted); } } + if (newly_submitted > 0) { + LOGINFOMOD(gcmgr, + "pdev_id={} scan complete: submitted {} new gc tasks (total pending now ~{}), " + "low_tier={}, remaining_capacity was {}", + pdev_id, newly_submitted, already_pending + newly_submitted, low_tier_submitted, + remaining_capacity); + } } } @@ -384,6 +476,9 @@ folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task(uint8_t priority process_gc_task(move_from_chunk, priority, std::move(promise), gc_task_id); }); } else { + // Increment BEFORE handing the task to the executor so that an immediately-completing + // task (which decrements via ~gc_task_guard) cannot underflow the counter. + m_pending_normal_gc_task_count.fetch_add(1, std::memory_order_relaxed); m_gc_executor->add([this, gc_task_id, priority, move_from_chunk, promise = std::move(promise)]() mutable { LOGDEBUGMOD(gcmgr, "start gc task : move_from_chunk_id={}, priority={}", move_from_chunk, priority); process_gc_task(move_from_chunk, priority, std::move(promise), gc_task_id); @@ -1261,6 +1356,11 @@ void GCManager::pdev_gc_actor::process_gc_task(chunk_id_t move_from_chunk, uint8 if (vchunk->m_state != ChunkState::GC) { GCLOGW(task_id, pg_id, NO_SHARD_ID, "move_from_chunk={} is expected to in GC state but not!", move_from_chunk); task.setValue(false); + // We return before constructing gc_task_guard, so we must mirror its bookkeeping here: + // decrement the per-pdev pending normal-priority counter that add_gc_task incremented. + if (priority == static_cast< uint8_t >(task_priority::normal)) { + m_pending_normal_gc_task_count.fetch_sub(1, std::memory_order_relaxed); + } m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id); return; } @@ -1426,6 +1526,9 @@ GCManager::pdev_gc_actor::~pdev_gc_actor() { GCManager::pdev_gc_actor::gc_task_guard::~gc_task_guard() { m_gc_actor->on_gc_task_completed(priority, pg_id, move_from_chunk, move_to_chunk, vchunk_id, success, task_id); task.setValue(success); + if (priority == static_cast< uint8_t >(task_priority::normal)) { + m_gc_actor->m_pending_normal_gc_task_count.fetch_sub(1, std::memory_order_relaxed); + } m_gc_actor->m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id); } diff --git a/src/lib/homestore_backend/gc_manager.hpp b/src/lib/homestore_backend/gc_manager.hpp index f85d9718..90b6a2a4 100644 --- a/src/lib/homestore_backend/gc_manager.hpp +++ b/src/lib/homestore_backend/gc_manager.hpp @@ -246,6 +246,12 @@ class GCManager { void stop(); uint32_t get_pdev_id() const { return m_pdev_id; } + // Returns the number of normal-priority GC tasks that are currently queued or running in + // m_gc_executor. Used by scan_chunks_for_gc to enforce a cross-scan quota cap. + uint32_t get_pending_normal_task_count() const { + return m_pending_normal_gc_task_count.load(std::memory_order_relaxed); + } + private: void process_gc_task(chunk_id_t move_from_chunk, uint8_t priority, folly::Promise< bool > task, const uint64_t task_id); @@ -312,6 +318,10 @@ class GCManager { std::shared_ptr< folly::IOThreadPoolExecutor > m_gc_executor; std::shared_ptr< folly::IOThreadPoolExecutor > m_egc_executor; std::atomic_bool m_is_stopped{true}; + // Tracks normal-priority GC tasks that are queued or actively running in m_gc_executor. + // Incremented in add_gc_task after a task is enqueued; decremented in on_gc_task_completed. + // Used by scan_chunks_for_gc to enforce a true cross-scan quota cap. + std::atomic< uint32_t > m_pending_normal_gc_task_count{0}; // since we have a very small number of reserved chunks, a vector is enough // TODO:: use a map if we have a large number of reserved chunks std::vector< homestore::superblk< GCManager::gc_reserved_chunk_superblk > > m_reserved_chunks; @@ -341,7 +351,10 @@ class GCManager { std::shared_ptr< pdev_gc_actor > try_create_pdev_gc_actor(uint32_t pdev_id, const homestore::superblk< GCManager::gc_actor_superblk >& gc_actor_sb); - bool is_eligible_for_gc(chunk_id_t chunk_id); + // Returns the garbage ratio percentage (0-100) for the given chunk if it is a valid GC candidate, + // or 0 if the chunk is not eligible (wrong state, no defrag blks, no pg, or pg not gc-able). + // Callers compare the returned ratio against their own threshold. + uint32_t get_chunk_gc_ratio(chunk_id_t chunk_id); void handle_all_recovered_gc_tasks(); diff --git a/src/lib/homestore_backend/hs_backend_config.fbs b/src/lib/homestore_backend/hs_backend_config.fbs index 1cc085d5..3619936a 100644 --- a/src/lib/homestore_backend/hs_backend_config.fbs +++ b/src/lib/homestore_backend/hs_backend_config.fbs @@ -35,8 +35,11 @@ table HSBackendSettings { //GC scan interval(second) gc_scan_interval_sec: uint64 = 60; - //GC garbage rate threshold, upon which a chunk will be selected for gc - gc_garbage_rate_threshold: uint8 = 50; + //GC garbage rate high watermark: chunks above this threshold use the full GC task quota + gc_garbage_rate_threshold: uint8 = 50 (hotswap); + + //GC garbage rate low watermark: chunks above this but below the high watermark use half the GC task quota + gc_garbage_rate_threshold_low: uint8 = 30 (hotswap); //enable read verify when gc is copying data gc_enable_read_verify: bool = true;